1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146
| ''' Created on Jun 30, 2012
@Author: killua @E-mail: [email protected] @Homepage: http://www.yidooo.net
Data set download from : http://www.grouplens.org/system/files/ml-100k.zip
MovieLens data sets were collected by the GroupLens Research Project at the University of Minnesota. The data was collected through the MovieLens web site (movielens.umn.edu) during the seven-month period from September 19th, 1997 through April 22nd, 1998.
This data set consists of: * 100,000 ratings (1-5) from 943 users on 1682 movies. * Each user has rated at least 20 movies. * Simple demographic info for the users
u.data -- The full u data set, 100000 ratings by 943 users on 1682 items. Each user has rated at least 20 movies. Users and items are numbered consecutively from 1. The data is randomly ordered. This is a tab-separated list of user id | item id | rating | timestamp. The timestamps are Unix seconds since 1/1/1970 UTC.
u.item -- Information about the items (movies); this is a pipe-separated ('|') list of movie id | movie title | release date | video release date | IMDb URL | unknown | Action | Adventure | Animation | Children's | Comedy | Crime | Documentary | Drama | Fantasy | Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi | Thriller | War | Western | The last 19 fields are the genres; a 1 indicates the movie is of that genre, a 0 indicates it is not; movies can be in several genres at once. The movie ids are the ones used in the u.data data set.
'''
from math import sqrt
def loadMovieData(path = "./data/"):
    """
    Load movie data from u.data and u.item

    @param path: Data set path (directory that contains u.item and u.data)
    @return: tuple (movieData, userData) where
             movieData maps movie title -> {user id: rating} and
             userData maps user id -> {movie title: rating}.
             Ids are kept as the strings read from the files, ratings are
             converted to float.
    @raise KeyError: if u.data references an item id missing from u.item
    """
    import os  # local import so the block does not depend on file-level imports

    # NOTE(review): files are opened with the platform default encoding;
    # confirm this suits the data set when running under Python 3.

    # movie id -> movie title, taken from the first two '|'-separated fields.
    movies = {}
    with open(os.path.join(path, 'u.item')) as itemFile:
        for line in itemFile:
            if not line.strip():
                continue  # tolerate blank / trailing empty lines
            movieId, movieTitle = line.split('|')[0:2]
            movies[movieId] = movieTitle

    movieData = {}
    userData = {}
    with open(os.path.join(path, 'u.data')) as ratingFile:
        for line in ratingFile:
            if not line.strip():
                continue
            # Fields: user id, item id, rating, timestamp (timestamp unused).
            userId, itemId, rating = line.split('\t')[0:3]
            # Both dictionaries are keyed by title, so ids that share a title
            # end up merged into a single entry.
            title = movies[itemId]
            score = float(rating)
            movieData.setdefault(title, {})[userId] = score
            # Index by the title of the rated item (the previous code reused
            # the stale `movieId` left over from the u.item loop).
            userData.setdefault(userId, {})[title] = score

    return (movieData, userData)
def euclidean(data, p1, p2):
    """
    Calculate a Euclidean-distance based similarity between two entries.

    The score is 1 / (1 + d) where d is the sum of squared rating
    differences over the items present in both data[p1] and data[p2]
    (no square root is taken). Identical shared ratings give 1.0 and the
    score shrinks towards 0 as the ratings diverge.

    @param data: dictionary of dictionaries, data[entry][item] -> rating
    @param p1: key of the first entry in data
    @param p2: key of the second entry in data
    @return: similarity in (0, 1], or 0.0 when the entries share no items
    """
    shared = [item for item in data[p1] if item in data[p2]]
    # Without any common item there is no evidence of similarity; returning
    # 1 / (1 + 0) here would rank unrelated entries as perfect matches.
    # pearson() reports 0 for the same situation.
    if not shared:
        return 0.0

    distance = sum(pow(data[p1][item] - data[p2][item], 2) for item in shared)
    return 1.0 / (1 + distance)
def pearson(data, p1, p2):
    """
    Calculate Pearson correlation coefficient between two entries.

    Only items present in both data[p1] and data[p2] take part in the
    computation.

    @param data: dictionary of dictionaries, data[entry][item] -> rating
    @param p1: key of the first entry in data
    @param p2: key of the second entry in data
    @return: the correlation coefficient, or 0 when the entries share no
             items or when either side has no variance over the shared items
    """
    corrItems = [item for item in data[p1] if item in data[p2]]
    if not corrItems:
        return 0

    # float() keeps the divisions below true divisions even for integer
    # ratings under Python 2.
    n = float(len(corrItems))
    xs = [data[p1][item] for item in corrItems]
    ys = [data[p2][item] for item in corrItems]

    sumX = sum(xs)
    sumY = sum(ys)
    sumXY = sum(x * y for x, y in zip(xs, ys))
    sumXsq = sum(pow(x, 2) for x in xs)
    sumYsq = sum(pow(y, 2) for y in ys)

    # n times the variance of each side.
    varX = sumXsq - pow(sumX, 2) / n
    varY = sumYsq - pow(sumY, 2) / n
    # A constant side has zero variance; rounding can even push the term
    # slightly below zero, which would make sqrt() raise ValueError.
    if varX <= 0 or varY <= 0:
        return 0

    denominator = sqrt(varX * varY)
    if denominator == 0:
        return 0

    return (sumXY - sumX * sumY / n) / denominator
def getSimilarItems(movieData, n = 20, similarity=pearson):
    """
    Build, for every key of movieData, the list of its n most similar
    other keys.

    @param movieData: dictionary of dictionaries handed unchanged to the
                      similarity function
    @param n: maximum number of matches kept per key
    @param similarity: callable (data, key1, key2) -> score
    @return: dictionary key -> list of (score, otherKey) tuples ordered
             from the highest score to the lowest
    """
    similarByMovie = {}
    for title in movieData:
        scored = []
        for candidate in movieData:
            if candidate == title:
                continue  # never compare an entry with itself
            scored.append((similarity(movieData, title, candidate), candidate))
        # Ascending sort, then flipped so the best matches come first.
        similarByMovie[title] = sorted(scored)[::-1][:n]
    return similarByMovie
def getRecommendationsItems(userData, user, similarItems, n = 10):
    """
    Get recommendations items for user

    Each item the user has not rated yet is scored with the
    similarity-weighted average of the user's ratings of the items it is
    listed as similar to.

    @param userData: dictionary user -> {item: rating}
    @param user: key of the user in userData
    @param similarItems: dictionary item -> list of (similarity, otherItem)
    @param n: maximum number of recommendations returned
    @return: list of (score, item) tuples, highest score first
    """
    rated = userData[user]
    weightedSums = {}
    similaritySums = {}

    for ratedItem, rating in rated.items():
        for simValue, candidate in similarItems[ratedItem]:
            # Only items the user has not rated are candidates.
            if candidate not in rated:
                weightedSums[candidate] = weightedSums.get(candidate, 0) + simValue * rating
                similaritySums[candidate] = similaritySums.get(candidate, 0) + simValue

    # Candidates whose similarities sum to zero cannot be normalised.
    ranked = sorted(
        (weightedSums[candidate] / similaritySums[candidate], candidate)
        for candidate in weightedSums
        if similaritySums[candidate] != 0
    )
    return ranked[::-1][:n]
if __name__ == "__main__":
    # Demo run: load the data set, build the item similarity table with the
    # euclidean score (50 neighbours per item) and print the recommendations
    # for user "87".
    # Single-argument print(...) calls behave the same on Python 2 and 3.
    print('Loading Data...')
    movieData, userData = loadMovieData("./movie/")
    print('Get similarItems...')
    similarItems = getSimilarItems(movieData, 50, euclidean)
    print('Calculate rankings...')
    rankings = getRecommendationsItems(userData, "87", similarItems)
    print(rankings)
|