异度部落格

学习是一种生活态度。

0%

【试题描述】把一个数组最开始的若干个元素搬到末尾,我们称之为数组的旋转。输入一个递增排序的数组的一个旋转,输出旋转数组的小元素。例如数组{3,4,5,1,2}为{1,2,3,4,5}的一个旋转,该数组的最小元素为1.

【试题来源】未知

【参考代码】

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
#include <iostream>
using namespace std;

int findMinNumber(int* numbers, int length) {

if(numbers == NULL || length <= 0) {
throw "Invalid parameters.";
}

int index1 = 0;
int index2 = length - 1;
int indexMid = (index1 + index2) / 2;
while(numbers[index1] >= numbers[index2]) {

if(index2 - index1 == 1) {
return numbers[index2];
}

indexMid = (index1 + index2) / 2;
if(numbers[index1] == numbers[index2]
&& numbers[index1] == numbers[indexMid]) {
int result = numbers[index1 + 1];
for(int i = index1 + 1; i <= index2; ++i) {
if(result > numbers[i]) {
result = numbers[i];
}
}
} else if(numbers[indexMid] >= numbers[index1]) {
index1 = indexMid;
} else if(numbers[indexMid] <= numbers[index2]) {
index2 = indexMid;
}
}
return numbers[indexMid];
}

int main() {
int array[] = {3, 4, 5, 1, 2};
//int array[] = {3, 4, 4, 5, 5, 6, 7, 8, 1, 2, 2, 2, 3, 3, 3};
cout << findMinNumber(array, 5) << endl;
return 0;
}

【试题描述】用两个栈实现一个队列。队列声明如下,请实现它的两个函数appendTail,deleteHead,分别完成在队列尾部插入节点和在头部删除节点。

【试题来源】未知

【参考代码】

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#include <iostream>
#include <stack>
using namespace std;

template <typename T>
class MyQueue
{

public:
void appendTail(const T& element);
T deleteHead();

private:
stack<T> inStack;
stack<T> outStack;
};

template <typename T>
void MyQueue<T>::appendTail(const T& element) {

inStack.push(element);
}

template <typename T>
T MyQueue<T>::deleteHead() {

if(outStack.empty()) {
//将inStack里面的数据放入outStack中
if(!inStack.empty()) {
while(!inStack.empty()) {
T element = inStack.top();
outStack.push(element);
inStack.pop();
}
} else {
throw "The Queue is empty.";
}
}
T element = outStack.top();
outStack.pop();

return element;
}

int main() {
MyQueue<int> queue;
queue.appendTail(1);
queue.appendTail(3);
queue.appendTail(5);
cout << queue.deleteHead() << endl;
cout << queue.deleteHead() << endl;
cout << queue.deleteHead() << endl;

return 0;
}

最近准备重温 C++,正好看到有关类和占用空间的问题,于是整理了下。先上代码

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
#include <iostream>
using namespace std;

//空类型,没有任何成员
class ClassA {

};

//添加构造函数和析构函数
class ClassB {

public:
ClassB() {};
~ClassB() {};

};

//仅包含一个虚构函数
class ClassC {
virtual void fun() {};
};

int main() {

ClassA A;
cout << "ClassA:" << sizeof(A) << endl;

ClassB B;
cout << "ClassB:" << sizeof(B) << endl;

ClassC C;
cout << "ClassC:" << sizeof(C) << endl;

return 0;
}

结果:

1
2
3
ClassA:1
ClassB:1
ClassC:4

【分析】 1)ClassA 里面没有定义任何变量和函数,本应该是 0 的,但是声明变量需要占用一个字节,因此是 1。
2)ClassB 里面仅仅定义了构造函数和析构函数。调用构造函数和析构函数只需要知道地址即可,而这些函数的地址只与类型相关,而与实例无关,编译器不会因为这两个函数在实例中添加任何信息。
3)ClassC 中仅有一个虚函数。C++编译器一旦发现类型中有虚函数,就会为该类型生成虚函数表,并在该类型的每个实例中添加一个指向虚函数表的指针。

PS:编译器 GCC 4.6.2

【试题描述】请实现一个函数,把字符串中的每个空格替换成%20。例如输入"We are happy.",则输出"We%20are%20happy."。(不能使用任何字符串操作函数)

【试题来源】未知

【试题分析】由于不能使用字符串操作函数,因此只能一个字符一个字符的进行处理。 首先先遍历一遍字符串,统计得到所有的空格,那么替换后的字符串长度应 为len(str) + numOfBlank * 2。然后从后往前面修改字符串。算法时间复杂度O(n)。

【参考代码】

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
#include <cstdio>
#include <cstring>
using namespace std;

void replaceSpace(char str[], int length) {
if(str == NULL) {
return ;
}

int numOfBlank = 0;
int i = 0;
while(str[i] != '\0') {
if(str[i] == ' ') {
numOfBlank++;
}
i++;
}

int indexOriginalString = length - 1;
int indexNewString = length + numOfBlank * 2 - 1;
while(indexOriginalString >= 0) {
if(str[indexOriginalString] == ' ') {
str[indexNewString--] = '0';
str[indexNewString--] = '2';
str[indexNewString--] = '%';
} else {
str[indexNewString--] = str[indexOriginalString];
}
indexOriginalString--;
}
}

int main() {
char str[] = "We are happy.";
//char str[] = " We are happy. ";
//char str[] = "Wearehappy.";
replaceSpace(str, strlen(str));
printf("%s", str);

return 0;
}

【试题描述】输入二叉树的前序遍历和中序遍历的结果,请重建出该二叉树。假设遍历结果中都不包含重复数字。例如输入前序遍历序列{1,2,4,7,3,5,6,8}和中序遍历{4,7,2,1,5,3,8,6},则重建出该二叉树,并以后续遍历输出。

【参考代码】

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#include <iostream>
using namespace std;

struct BinaryTreeNode
{
int value;
BinaryTreeNode* left;
BinaryTreeNode* right;
};

BinaryTreeNode* rebulidBinaryTreeCore(int* startPreOrder, int* endPreOrder,
int* startInOrder, int* endInOrder) {

//先序遍历的第一个节点为根节点
//创建根节点
BinaryTreeNode* rootNode = new BinaryTreeNode();
rootNode->value = startPreOrder[0];
rootNode->left = NULL;
rootNode->right = NULL;

if(startPreOrder == endPreOrder) {
if(startInOrder == endInOrder
&& *startPreOrder == *startInOrder) {
return rootNode;
} else {
throw ("Invalid input.");
}
}

//在中序查找根节点
int* rootIndex = startInOrder;
while(rootIndex <= endInOrder && *rootIndex != rootNode->value) {
rootIndex++;
}
if(rootIndex == endInOrder && *rootIndex != rootNode->value) {
throw ("Invalid input.");
}

//重建左子树
int leftTreeLength = rootIndex - startInOrder;
if(leftTreeLength > 0) {
rootNode->left = rebulidBinaryTreeCore(startPreOrder + 1,
startPreOrder + leftTreeLength,
startInOrder,
rootIndex - 1);
}

//重建右子树
if(leftTreeLength < endPreOrder - startPreOrder) {
rootNode->right = rebulidBinaryTreeCore(startPreOrder + leftTreeLength + 1,
endPreOrder,
rootIndex + 1,
endInOrder);
}

return rootNode;
}

BinaryTreeNode* rebulidBinaryTree(int* preOrder, int* inOrder, int n) {
if(preOrder == NULL || inOrder == NULL || n <= 0) {
return NULL;
}

return rebulidBinaryTreeCore(preOrder, preOrder + n -1,
inOrder, inOrder + n - 1);
}

void printPostOrder(BinaryTreeNode* root) {
if(root!= NULL) {
if(root->left != NULL) {
printPostOrder(root->left);
}
if(root->right != NULL) {
printPostOrder(root->right);
}
cout << root->value << " ";
}
return ;
}

int main() {
int preOrder[8] = {1,2,4,7,3,5,6,8};
int inOrder[8] = {4,7,2,1,5,3,8,6};
BinaryTreeNode* root = rebulidBinaryTree(preOrder, inOrder, 8);
printPostOrder(root);
return 0;
}

几乎所有的博客都可以在线阅读,或者通过 RSS 订阅源进行阅读。RSS 订阅源是一个包含博客及其所有文章条目信息的简单的 XML 文档。 程序中使用了 feedparser 第三方模块,可以轻松地从任何 RSS 或 Atom 订阅源中得到标题、链接和文章的条目。完整代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
'''
Created on Jul 14, 2012

@Author: killua
@E-mail: [email protected]
@Homepage: http://www.yidooo.net
@Decriptioin: Counting the words in a Feed

feedparser:feedparser is a Python library that parses feeds in all known formats, including Atom, RSS, and RDF.It runs on Python 2.4 all the way up to 3.2.

dataset: http://kiwitobes.com/clusters/feedlist.txt
You can download feeds from this list. Maybe some feeds you can access in China.
'''

import feedparser
import re

#Get word from feed
def getwords(html):
#Remove all the HTML tags
text = re.compile(r"<[^>]+>").sub('', html)

#Split words by all non-alpha characters
words = re.compile(r"[^A-Z^a-z]+").split(text)

#Convert words to lowercase
wordlist = [word.lower() for word in words if word != ""]

return wordlist

#Returns title and dictionary of word counts for an RSS feed
def getFeedwordcounts(url):
#Parser the feed
d = feedparser.parse(url)
wordcounts = {}

#Loop over all the entries
for e in d.entries:
if 'summary' in e:
summary = e.summary
else:
summary = e.description

words = getwords(e.title + ' ' + summary)
for word in words:
wordcounts.setdefault(word, 0)
wordcounts[word] += 1

return d.feed.title, wordcounts

if __name__ == '__main__':
#count the words appeared in blog
blogcount = {}
wordcounts = {}

feedFile = file('resource/feedlist.txt')
feedlist = [line for line in feedFile.readlines()]

for feedUrl in feedlist:
try:
title, wc = getFeedwordcounts(feedUrl)
wordcounts[title] = wc
for word, count in wc.items():
blogcount.setdefault(word, 0)
if count > 1:
blogcount[word] += 1
except:
print 'Failed to parse feed %s' % feedUrl

wordlist = []
for w, bc in blogcount.items():
frac = float(bc) / len(feedlist)
if frac > 0.1 and frac < 0.5:
wordlist.append(w)

#Write the result to the file
datafile = file('blogdata', 'w')
#Write result's head
datafile.write('Blog')
for word in wordlist:
datafile.write('\t%s' % word)
datafile.write('\n')
#Write results
for blogname, wc in wordcounts.items():
print blogname
datafile.write(blogname)
for word in wordlist:
if word in wc:
datafile.write("\t%d" % wc[word])
else:
datafile.write("\t0")
datafile.write('\n')

2012 Microsoft Intern Hiring Written Test

1. Suppose that a Selection Sort of 80 items has completed 32 iterations of the main loop. How many items are now guaranteed to be in their final spot (never to be moved again)?
A) 16 B) 31 C) 32 D) 39 E) 40

2. Which Synchronization mechanism(s) is/are used to avoid race conditions among processes/threads in operating systems?
A) Mutex B) Mailbox C) Semaphore D) Local procedure call

3. There is a sequence of n numbers 1, 2, 3,.., n and a stack which can keep m numbers at most. Push the n numbers into the stack following the sequence and pop out randomly. Suppose n is 2 and m is 3, the output sequence may be 1, 2 or 2, 1, so we get 2 different sequences. Suppose n is 7 and m is 5, please choose the output sequences of the stack:
A) 1, 2, 3, 4, 5, 6, 7
B) 7, 6, 5, 4, 3, 2, 1
C) 5, 6, 4, 3, 7, 2, 1
D) 1, 7, 6, 5, 4, 3, 2
E) 3, 2, 1, 7, 5, 6, 4

4. What is the result of binary number 01011001 after multiplying by 0111001 and adding 1101110?
A) 0001 0100 0011 1111
B) 0101 0111 0111 0011
C) 0011 0100 0011 0101

5. What is output if you compile and execute the following code?

1
2
3
4
5
6
7
void main()
{
int i = 11;
int const *p = &i;
p++;
printf("%d", *p);
}
  1. 11 B) 12 C) Garbage value D) Compile error E) None of above

6. Which of following C++ code is correct?
A) int f() { int a = new int(3); return a; } B) int f() { int a[3] = {1, 2, 3}; return a; } C) vector f() { vector v(3); return v; } D) void f(int ret) { int a[3] = {1, 2, 3}; ret = a; return; }

7. Given that the 180-degree rotated image of a 5-digit number is another 5-digit number and the difference between the numbers is 78633, what is the original 5-digit number?
A) 60918 B) 91086 C) 18609 D) 10968 E) 86901

8. Which of the following statements are true?
A) We can create a binary tree from given inorder and preorder traversal sequences.
B) We can create a binary tree from given preorder and postorder traversal sequences.
C) For an almost sorted array, insertion sort can be more effective than Quicksort.
D) Suppose T(n) is the runtime of resolving a problem with n elements, T(n) = Θ(1) if n = 1; T(n) = 2T(n/2) + Θ(n) if > 1; so T(n) is Θ(n log n).
E) None of the above.

9. Which of the following statements are true?
A) Insertion sort and bubble sort are not effcient for large data sets.
B) Quick sort makes O(n^2) comparisons in the worst case.
C) There is an array: 7, 6, 5, 4, 3, 2, 1. If using selection sort (ascending), the number of swap operation is 6.
D) Heap sort uses two heap operations: insertion and root deletion.
E) None of above.

10. Assume both x and y are integers, which one of the followings returns the minimum of the two integers?
A) y ^ ((x ^ y) & ~(x < y))
B) y ^(x ^ y)
C) x ^ (x ^ y)
D) (x ^ y) ^ (y ^ x)
E) None of the above

11. The Orchid Pavilion (兰亭集序) is well known as the top of "行书" in history of Chinese literature. The most fascinating sentence "Well I know it is a lie to say that life and death is the same thing, and that longevity and early death make no difference! Alas!" ("周知一死生为虚诞,齐彭殇为妄作。") By counting the characters of the whole content (in Chinese version), the result should be 391 (including punctuation). For these characters written to a text file, please select the possible file size without any data corrupt.
A) 782 bytes in UTF-16 encoding
B) 784 bytes in UTF-16 encoding
C) 1173 bytes in UTF-8 encoding D) 1176 bytes in UTF-8 encoding
E) None of the above

12. Fill the blanks inside class definition

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
class Test
{
public:
____ int a;
____ int b;
public:
Test::Test(int _a, int _b) : a(_a) {b = _b;}
};
int Test::b;
int _tmain(int argc, __TCHAR *argv[])
{
Test t1(0, 0), t2(1, 1);
t1.b = 10;
t2.b = 20;
printf("%u %u %u %u", t1.a, t1.b, t2.a, t2.b);
}

Running result: 0 20 1 20
A) static/const
B) const/static
C) --/static
D) const static/static
E) None of the above

13. A 3-order B-tree has 2047 key words, what is the maximum height of the tree?
A) 11 B) 12 C) 13 D) 14

14. In C++, which of the following keyword(s) can be used on both a variable and a function?
A) static B) virtual C) extern D) inline E) const

15. What is the result of the following program?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
char* f(char *str, char ch)
{
char *it1 = str;
char *it2 = str;
while (*it2 != '\0') {
while (*it2 == ch) {
it2++;
}
*it1++ = *it2++;
}
return str;
}
void main(int argc, char *argv[])
{
char *a = new char[10];
strcpy(a, "abcdcccd");

cout << f(a, 'c');
}
  1. abdcccd B) abdd C) abcc D) abddcccd E) Access Violation

16. Consider the following definition of a recursive function, power, that will perform exponentiation.

1
2
3
4
5
6
int power(int b, int e)
{
if (e == 0) return 1;
if (e %2 == 0) return power (b * b, e / 2);
return b * power(b * b, e / 2);
}

Asymptotically (渐进地) in terms of the exponent e, the number of calls to power that occur as a result of the call power(b, e) is* A) logarithmic B) linear C) quadratic D) exponential

17. Assume a full deck of cards has 52 cards, 2 black suits (spade and club) and 2 red suits (diamond and heart). If you are given a full deck, and a half deck (with 1 red suit and 1 black suit), what's the possibility for each one getting 2 red cards if taking 2 cards?
A) 1/2, 1/2 B) 25/102, 12/50 C) 50/51, 24/25 D) 25/51, 12/25 E) 25/51, 1/2

18. There is a stack and a sequence of n numbers (i.e., 1, 2, 3, ..., n). Push the n numbers into the stack following the sequence and pop out randomly. How many different sequences of the number we may get? Suppose n is 2, the output sequence may be 1, 2 or 2, 1, so we get 2 different sequences.
A) C_2n^n
B) C_2n^n - C_2n^(n + 1)
C) ((2n)!) / (n + 1)n!n!
D) n!
E) None of the above

19. Longest Increasing Subsequence (LIS) means a sequence containing some elements in another sequence by the same order, and the values of elements keeps increasing. For example, LIS of {2, 1, 4, 2, 3, 7, 4, 6} is {1, 2, 3, 4, 6}, and its LIS length is 5. Considering an array with N elements, what is the lowest time and space complexity to get the length of LIS?
A) Time: N^2, Space: N^2
B) Time: N^2, Space: N
C) Time: NlogN, Space: N
D) Time: N, Space: N
E) Time: N, Space: C

20. What is the output of the following piece of C++ code?

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
#include <iostream>
using namespace std;*
struct Item
{
char c;
Item *next;
};
Item Routine1(Item *x)
{
Item *prev = NULL, *curr = x;
while (curr) {
Item *next = curr->next;
curr->next = prev;
prev = curr;
curr = next;
}
return prev;
}
void Routine2(Item *x)
{
Item *curr = x;
while (curr) {
cout << curr->c << " ";
curr = curr->next;
}
}
void _tmain(void)
{
Item *x,
d = {'d', NULL},
c = {'c', &d},
b = {'b', &c},
a = {'a', &b};*
x = Routine1(&a);
Routine2(x);
}
  1. cbad B) badc C) dbca D) abcd E) dcba

Ubuntu: sudo apt-get install ttf-arphic-uming

Fedora:sudo yum install cjkuni-uming-fonts

然后打开http://web.sanguosha.com/  测试,如果不行请重启电脑。

PS:本人测试后发现安装字体只能解决 Firefox 下面的乱码,但无法解决 Chrome 浏览器下面的问题。

参考资料: http://hi.baidu.com/gerald07/item/fb2234cfbfef9f21e80f2e77 http://hi.baidu.com/siweng23/blog/item/aa9dfcfbd4f4564c242df2ec.html http://blog.csdn.net/devplus/article/details/6347492

协同过滤技术可以分为三类:基于用户(User-based)的协同过滤;基于项目(Item-based)的协同过滤;基于模型(Model-based)的协同过滤。 

基于用户(User-based)的协同过滤

用相似统计的方法得到具有相似爱好或者兴趣的相邻用户,所以称之为以用户为基础(User-based)的协同过滤或基于邻居的协同过滤(Neighbor-based Collaborative Filtering)。方法步骤:

1.收集用户资讯

收集可以代表用户兴趣的资讯。一般的网站系统使用评分的方式或是给予评价,这种方式被称为「主动评分」。另外一种是「被动评分」,是根据用户的行为模式由系统代替用户完成评价,不需要用户直接打分或输入评价资料。电子商务网站在被动评分的资料获取上有其优势,用户购买的商品记录是相当有用的资料。

2.最近邻搜索(Nearest neighbor search, NNS)

以用户为基础(User-based)的协同过滤的出发点是与用户兴趣爱好相同的另一组用户,就是计算两个用户的相似度。例如:寻找n个和A有相似兴趣用户,把他们对M的评分作为A对M的评分预测。一般会根据资料的不同选择不同的演算法,目前较多使用的相似度演算法有Person Correlation Coefficient、Cosine-based Similarity、Adjusted Cosine Similarity。

3.产生推荐结果

有了最近邻集合,就可以对目标用户的兴趣进行预测,产生推荐结果。依据推荐目的的不同进行不同形式的推荐,较常见的推荐结果有Top-N推荐和关联推荐。 Top-N推荐是针对个体用户产生,对每个人产生不一样的结果,例如:透过对A用户的最近邻用户进行统计,选择出现频率高且在A用户的评分项目中不存在的,作为推荐结果。关联推荐是对最近邻用户的记录进行关联规则(association rules)挖掘。

基于项目(Item-based)的协同过滤

以项目为基础的协同过滤方法有一个基本的假设:“能够引起用户兴趣的项目,必定与其之前评分高的项目相似”,透过计算项目之间的相似性来代替用户之间的相似性。方法步骤:

1.收集用户资讯

同以用户为基础(User-based)的协同过滤。

2.针对项目的最近邻搜索

先计算己评价项目和待预测项目的相似度,并以相似度作为权重,加权各已评价项目的分数,得到待预测项目的预测值。例如:要对项目A和项目B进行相似性计算,要先找出同时对A和B打过分的组合,对这些组合进行相似度计算,常用的演算法同以用户为基础(User-based )的协同过滤。

3.产生推荐结果

以项目为基础的协同过滤不用考虑用户间的差别,所以精度比较差。但是却不需要用户的历史资料,或是进行用户识别。对于项目来讲,它们之间的相似性要稳定很多,因此可以离线完成工作量最大的相似性计算步骤,从而降低了线上计算量,提高推荐效率,尤其是在用户多于项目的情形下尤为显著。

基于模型(Model-based)的协同过滤

以用户为基础(User-based)的协同过滤和以项目为基础(Item-based)的协同过滤统称为以记忆为基础(Memory based)的协同过滤技术,他们共有的缺点是资料稀疏,难以处理大资料量影响即时结果,因此发展出以模型为基础的协同过滤技术。以模型为基础的协同过滤(Model-based Collaborative Filtering)是先用历史资料得到一个模型,再用此模型进行预测。以模型为基础的协同过滤广泛使用的技术包括Latent Semantic Indexing、 Bayesian Networks…等,根据对一个样本的分析得到模型。

参考资料:http://gengrenjie.com/2009/04/12/%E5%8D%8F%E5%90%8C%E8%BF%87%E6%BB%A4%E6%89%AB%E7%9B%B2%E7%8F%AD%EF%BC%884%EF%BC%89/

本文构建了一个简单的推荐系统,使用的数据是真实的数据,叫作MovieLens,来自University of Minnesota‘s GroupLens项目组。代码以Python作为实现语言,使用版本为Python2.7。

loadMovieData:用于数据的读取。userData指的是以userId为键构建的电影评分列表。movieData值的是以movieId为键构建的电影评分列表。

euclidean:用于计算Eucidean距离系数

pearson:用于计算Pearson相关系数

getSimilarItems:计算出movieData中每一项相似度最大的n项

getRecommendationsItems:对于某个user取得推荐结果

代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
'''
Created on Jun 30, 2012

@Author: killua
@E-mail: [email protected]
@Homepage: http://www.yidooo.net

Data set download from : http://www.grouplens.org/system/files/ml-100k.zip

MovieLens data sets were collected by the GroupLens Research Project
at the University of Minnesota.The data was collected through the MovieLens web site
(movielens.umn.edu) during the seven-month period from September 19th,
1997 through April 22nd, 1998.

This data set consists of:
* 100,000 ratings (1-5) from 943 users on 1682 movies.
* Each user has rated at least 20 movies.
* Simple demographic info for the users

u.data -- The full u data set, 100000 ratings by 943 users on 1682 items.
Each user has rated at least 20 movies. Users and items are
numbered consecutively from 1. The data is randomly
ordered. This is a tab separated list of
user id | item id | rating | timestamp.
The time stamps are unix seconds since 1/1/1970 UTC
u.item -- Information about the items (movies); this is a tab separated
list of
movie id | movie title | release date | video release date |
IMDb URL | unknown | Action | Adventure | Animation |
Children's | Comedy | Crime | Documentary | Drama | Fantasy |
Film-Noir | Horror | Musical | Mystery | Romance | Sci-Fi |
Thriller | War | Western |
The last 19 fields are the genres, a 1 indicates the movie
is of that genre, a 0 indicates it is not; movies can be in
several genres at once.
The movie ids are the ones used in the u.data data set.
'''

from math import sqrt

def loadMovieData(path = "./data/"):
"""
Load movie data from u.data and u.item
@param path: Data set path
"""
#Get movie's title
movies = {}
for line in open(path + '/u.item'):
(movieId, movieTitle) = line.split('|')[0:2]
movies[movieId] = movieTitle

#Load Data
movieData = {}
userData = {}
for line in open(path + '/u.data'):
(userId, itemId, rating, timestamp)=line.split('\t')
movieData.setdefault(movies[itemId], {})
movieData[movies[itemId]][userId] = float(rating)
userData.setdefault(userId, {})
userData[userId][movies[movieId]] = float(rating)

return (movieData, userData)

def euclidean(data, p1, p2):
"Calculate Euclidean distance"
distance = sum([pow(data[p1][item]-data[p2][item],2)
for item in data[p1] if item in data[p2]])

return 1.0 / (1 + distance)

def pearson(data, p1, p2):
"Calculate Pearson correlation coefficient"
corrItems = [item for item in data[p1] if item in data[p2]]

n = len(corrItems)
if n == 0:
return 0;

sumX = sum([data[p1][item] for item in corrItems])
sumY = sum([data[p2][item] for item in corrItems])
sumXY = sum([data[p1][item] * data[p2][item] for item in corrItems])
sumXsq = sum([pow(data[p1][item], 2) for item in corrItems])
sumYsq = sum([pow(data[p2][item],2) for item in corrItems])

if sqrt((sumXsq - pow(sumX, 2) / n) * (sumYsq - pow(sumY, 2) / n)) != 0:
pearson = (sumXY - sumX * sumY / n) / sqrt((sumXsq - pow(sumX, 2) / n) * (sumYsq - pow(sumY, 2) / n))
else:
return 0

return pearson

def getSimilarItems(movieData, n = 20, similarity=pearson):
"""
Create a dictionary of items showing which other items they are most similar to.
"""

results = {}
for movie in movieData:
#Get n items which most similar to movie
matches = [(similarity(movieData, movie, otherMovie),otherMovie)
for otherMovie in movieData if movie != otherMovie]
matches.sort()
matches.reverse()
results[movie] = matches[0:n]

return results

def getRecommendationsItems(userData, user, similarItems, n = 10):
"""
Get recommendations items for user
"""
userRatings = userData[user]
itemScores = {}
totalSim = {}

# Loop over items rated by this user
for (item, rating) in userRatings.items():
# Loop over items similar to this one
for (simValue, simItem) in similarItems[item]:
# Ignore if this user has already rated this item
if simItem in userRatings:
continue
# Weighted sum of rating times similarity
itemScores.setdefault(simItem, 0)
itemScores[simItem] += simValue * rating
# Sum of all the similarities
totalSim.setdefault(simItem, 0)
totalSim[simItem] += simValue

# Divide each total score by total weighting to get an average
rankings = [(score / totalSim[item], item) for (item, score) in itemScores.items() if totalSim[item] != 0]
rankings.sort()
rankings.reverse()

return rankings[0:n]

if __name__ == "__main__":

print 'Loading Data...'
movieData, userData = loadMovieData("./movie/")
print 'Get similarItems...'
similarItems = getSimilarItems(movieData, 50, euclidean)
print 'Calculate rankings...'
rankings = getRecommendationsItems(userData, "87", similarItems)

print rankings