This commit is contained in:
Michael Zhang 2021-09-06 00:19:37 -05:00
commit e228e16f22
Signed by: michael
GPG key ID: BDA47A31A3C8EE6B
3 changed files with 98 additions and 0 deletions

21
HW1-data.csv Normal file
View file

@ -0,0 +1,21 @@
User,"Gender (1 =F, 0=M)",260: Star Wars: Episode IV - A New Hope (1977),1210: Star Wars: Episode VI - Return of the Jedi (1983),356: Forrest Gump (1994),"318: Shawshank Redemption, The (1994)","593: Silence of the Lambs, The (1991)",3578: Gladiator (2000),1: Toy Story (1995),2028: Saving Private Ryan (1998),296: Pulp Fiction (1994),1259: Stand by Me (1986),2396: Shakespeare in Love (1998),2916: Total Recall (1990),780: Independence Day (ID4) (1996),541: Blade Runner (1982),1265: Groundhog Day (1993),"2571: Matrix, The (1999)",527: Schindler's List (1993),"2762: Sixth Sense, The (1999)",1198: Raiders of the Lost Ark (1981),34: Babe (1995)
755,0,1,5,2,,4,4,2,2,,3,2,,5,2,,4,2,5,,
5277,0,5,3,,2,4,2,1,,,4,3,2,2,,2,,5,1,3,
1577,1,,,,5,2,,4,,,1,,1,4,4,1,1,2,3,1,3
4388,0,,3,,,,1,2,3,4,,,4,1,3,5,,5,1,1,2
1202,1,4,3,4,1,4,1,,4,,1,5,1,,4,,3,5,5,,
3823,1,2,4,4,4,,,3,1,4,4,5,2,4,,1,,,3,,2
5448,0,,,3,1,1,4,,5,2,,1,,,3,,1,,,5,2
5347,0,4,,,,3,2,2,,3,,,2,3,2,4,,1,3,5,
4117,1,5,1,,4,2,4,4,4,,1,2,3,1,,5,,,,,5
2765,0,4,2,,5,3,,4,3,4,,,,2,,,2,5,1,,
5450,1,2,1,5,,,5,5,,,,,3,2,,,1,,2,1,4
139,0,3,5,2,,2,,2,,1,,3,,3,,2,5,,,,2
1940,0,2,3,,5,4,,4,5,,,,2,4,,3,,,,5,
3118,1,3,,3,,2,,3,,,4,,1,2,2,3,5,1,,,
4656,1,4,4,,,5,5,2,,3,5,,1,3,,2,,3,,3,1
4796,1,,,1,,3,2,,2,,1,5,,,,5,2,2,4,3,4
6037,0,,,,,4,,2,,2,,2,,,,,4,,,,
3048,1,4,5,1,5,1,1,4,,5,,,,,4,,,2,1,2,5
4790,0,5,1,,,,4,2,1,3,3,3,1,,,,2,,,,
4489,0,1,2,2,4,5,,2,3,2,2,1,,,5,5,4,3,5,3,
1 User Gender (1 =F, 0=M) 260: Star Wars: Episode IV - A New Hope (1977) 1210: Star Wars: Episode VI - Return of the Jedi (1983) 356: Forrest Gump (1994) 318: Shawshank Redemption, The (1994) 593: Silence of the Lambs, The (1991) 3578: Gladiator (2000) 1: Toy Story (1995) 2028: Saving Private Ryan (1998) 296: Pulp Fiction (1994) 1259: Stand by Me (1986) 2396: Shakespeare in Love (1998) 2916: Total Recall (1990) 780: Independence Day (ID4) (1996) 541: Blade Runner (1982) 1265: Groundhog Day (1993) 2571: Matrix, The (1999) 527: Schindler's List (1993) 2762: Sixth Sense, The (1999) 1198: Raiders of the Lost Ark (1981) 34: Babe (1995)
2 755 0 1 5 2 4 4 2 2 3 2 5 2 4 2 5
3 5277 0 5 3 2 4 2 1 4 3 2 2 2 5 1 3
4 1577 1 5 2 4 1 1 4 4 1 1 2 3 1 3
5 4388 0 3 1 2 3 4 4 1 3 5 5 1 1 2
6 1202 1 4 3 4 1 4 1 4 1 5 1 4 3 5 5
7 3823 1 2 4 4 4 3 1 4 4 5 2 4 1 3 2
8 5448 0 3 1 1 4 5 2 1 3 1 5 2
9 5347 0 4 3 2 2 3 2 3 2 4 1 3 5
10 4117 1 5 1 4 2 4 4 4 1 2 3 1 5 5
11 2765 0 4 2 5 3 4 3 4 2 2 5 1
12 5450 1 2 1 5 5 5 3 2 1 2 1 4
13 139 0 3 5 2 2 2 1 3 3 2 5 2
14 1940 0 2 3 5 4 4 5 2 4 3 5
15 3118 1 3 3 2 3 4 1 2 2 3 5 1
16 4656 1 4 4 5 5 2 3 5 1 3 2 3 3 1
17 4796 1 1 3 2 2 1 5 5 2 2 4 3 4
18 6037 0 4 2 2 2 4
19 3048 1 4 5 1 5 1 1 4 5 4 2 1 2 5
20 4790 0 5 1 4 2 1 3 3 3 1 2
21 4489 0 1 2 2 4 5 2 3 2 2 1 5 5 4 3 5 3

BIN
HW1-data.ods Normal file

Binary file not shown.

77
HW1.py Normal file
View file

@ -0,0 +1,77 @@
import pandas as pd
import numpy as np
import functools
# Header of the gender column in HW1-data.csv (1 = female, 0 = male).
GENDER = "Gender (1 =F, 0=M)"
# Header of the Toy Story ratings column in HW1-data.csv.
TOY_STORY = "1: Toy Story (1995)"
def dbg(v):
    """Debug helper: print ``v`` to stdout, then return it unchanged.

    Because the argument is returned as-is, a call can be wrapped around
    any expression without altering the value that flows through.
    """
    print(v)
    return v
def correl(x, y):
    """Return the Pearson correlation coefficient of ``x`` and ``y``.

    The two series are paired by position. Only positions where *both*
    values are present (non-NaN) take part in the computation, so the means,
    the covariance term and the variance terms are all taken over the same
    set of observations.

    Previously each series had its NaNs dropped independently before being
    zipped together, which shifted the values against each other and paired
    up entries that belonged to different rows.

    Args:
        x: pandas Series (or other iterable) of numbers, possibly with NaN.
        y: pandas Series (or other iterable) of numbers, possibly with NaN.

    Returns:
        The correlation coefficient as a float; NaN when there is no
        position at which both inputs have a value.
    """
    # Keep only the complete (x, y) pairs.
    pairs = [(a, b) for a, b in zip(x, y) if pd.notna(a) and pd.notna(b)]
    if not pairs:
        return np.nan
    xs, ys = (np.asarray(col, dtype=float) for col in zip(*pairs))
    dx = xs - xs.mean()
    dy = ys - ys.mean()
    num = (dx * dy).sum()
    den = np.sqrt((dx ** 2).sum() * (dy ** 2).sum())
    return num / den
# Sanity check for correl() on a small sample with a known coefficient.
X = pd.Series([195, 151, 148, 189, 183, 154])
Y = pd.Series([200, 180, 178, 165, 192, 144])
print(correl(X, Y))
# Runs every time the script is executed; assert statements are skipped
# when Python is started with -O.
assert np.isclose(correl(X, Y), 0.46706598573232)
# Load the ratings table. The first two columns are the user id and the
# gender code; every remaining column holds one movie's ratings, with blank
# cells read as NaN.
data = pd.read_csv("HW1-data.csv")
gender_data = data.iloc[:, :2].to_dict(orient="list")
# User ids in row order; gender_rating() indexes this list by row label.
users = gender_data["User"]
# Rebind gender_data to a {user id: gender code} mapping.
gender_data = dict(zip(gender_data["User"], gender_data[GENDER]))
print(gender_data)
# print(gender_data)
# Everything after the two identifying columns is per-movie ratings.
movie_data = data.iloc[:, 2:]
# movie_data.columns = movie_data.iloc[0]
# print(movie_data)
print("Response 1: highest average")
# Top three movies by mean rating (the mean ignores NaN entries).
print(movie_data.mean().sort_values(ascending=False)[:3])
print()
print("Response 2: popularity (# of reviews)")
# Top three movies by number of non-NaN ratings.
print(movie_data.count().sort_values(ascending=False)[:3])
print()
print("Response 3: %reviews greater than 4")
# Top three movies by the fraction of their ratings that are 4 or higher.
# NOTE(review): the label says "greater than 4" but the comparison is >= 4;
# confirm which one is intended.
print(movie_data.apply(lambda s: (s >= 4).sum() / s.count()).sort_values(ascending=False)[:3])
print()
print("Response 4: %correlation with Toy Story")
# Toy Story's ratings indexed by row label, plus a boolean mask of the rows
# that actually rated it; both are read by the toy_story_correlate* helpers.
toy_story_raters = movie_data.transpose().loc[TOY_STORY]
toy_story_raters_n = toy_story_raters.notna()
def toy_story_correlate(s):
    """Return the share of Toy Story raters who also rated the movie ``s``.

    ``s`` is one ratings column (NaN where a row has no rating). The result
    is the number of rows that rated both ``s`` and Toy Story, divided by
    the number of rows that rated Toy Story. Relies on the module-level
    ``toy_story_raters`` and ``toy_story_raters_n``.
    """
    rated_s = s.notna()
    rated_both = sum(
        1
        for row, has_rating in rated_s.items()
        if toy_story_raters_n[row] and has_rating
    )
    return rated_both / toy_story_raters.count()
# Top five movies by the share of Toy Story raters who also rated them.
# Toy Story's own column is included in movie_data and scores 1.0.
print(movie_data.apply(toy_story_correlate).sort_values(ascending=False)[:5])
print()
print("Response 5: correlation with Toy Story")
def toy_story_correlate2(s):
    """Return ``correl`` of the ratings column ``s`` against Toy Story's ratings.

    Relies on the module-level ``toy_story_raters`` series.
    """
    coefficient = correl(s, toy_story_raters)
    return coefficient
# Every movie's correlation with Toy Story, highest first.
print(movie_data.apply(toy_story_correlate2).sort_values(ascending=False))
# Marker left by the author: this response is flagged as unfinished.
print("TODO NOT DONE")
print()
print("Response 6: Mean difference by gender")
# Predicates on a user id, based on the gender code (0 = male, 1 = female).
# iswoman returns the stored code itself rather than a bool.
# NOTE(review): neither lambda is called in the code visible here, and the
# parameter name shadows the builtin `id`.
isman = lambda id: not gender_data[id]
iswoman = lambda id: gender_data[id]
def gender_rating(g, s):
    """Return the mean of the ratings in ``s`` from users whose gender code is ``g``.

    ``s`` is one ratings column; each row label is looked up in the
    module-level ``users`` list to get the user id, and that id is looked up
    in ``gender_data`` to get the gender code. NaN ratings are kept in the
    selection and ignored by the mean.
    """
    selected = [
        rating
        for row, rating in s.items()
        if gender_data[users[row]] == g
    ]
    return pd.Series(selected).mean()
# Per-movie mean rating among men (code 0) and among women (code 1).
men_ratings = movie_data.apply(functools.partial(gender_rating, 0))
women_ratings = movie_data.apply(functools.partial(gender_rating, 1))
# NOTE(review): the heading announces a mean *difference*, but only the two
# mean series are printed; the difference itself is not computed here.
print(men_ratings, women_ratings)