77 lines
2.3 KiB
Python
77 lines
2.3 KiB
Python
import pandas as pd
|
|
import numpy as np
|
|
import functools
|
|
|
|
GENDER = "Gender (1 =F, 0=M)"
|
|
TOY_STORY = "1: Toy Story (1995)"
|
|
|
|
def dbg(v):
    """Print ``v`` and return it unchanged.

    Returning the argument lets a call be wrapped around any sub-expression
    to inspect its value without restructuring the surrounding code.
    """
    print(v)
    return v
|
|
|
|
def correl(x, y):
    """Return the Pearson correlation coefficient of two paired sequences.

    The inputs are paired by position. A pair is used only when *both*
    values are present, so a missing value in one input removes the
    matching observation from the other as well. (Dropping missing values
    from each input independently would shift the remaining values against
    each other and correlate unrelated observations.)

    Args:
        x: Numeric sequence (e.g. a ``pandas.Series``); may contain NaN.
        y: Numeric sequence paired with ``x``; may contain NaN.

    Returns:
        The correlation as a float, or NaN when no complete pair exists or
        when either input is constant over the complete pairs.
    """
    xs = np.asarray(x, dtype=float)
    ys = np.asarray(y, dtype=float)

    # Pair positionally up to the shorter input, like zip() does.
    n = min(len(xs), len(ys))
    xs, ys = xs[:n], ys[:n]

    # Keep only the observations where both values are present.
    complete = ~(np.isnan(xs) | np.isnan(ys))
    xs, ys = xs[complete], ys[complete]
    if xs.size == 0:
        return float("nan")

    dx = xs - xs.mean()
    dy = ys - ys.mean()
    den = np.sqrt((dx ** 2).sum() * (dy ** 2).sum())
    if den == 0:
        # Zero variance in one of the inputs: the correlation is undefined.
        return float("nan")
    return (dx * dy).sum() / den
|
|
# Sanity check for correl() against a known reference value.
X = pd.Series([195, 151, 148, 189, 183, 154])
Y = pd.Series([200, 180, 178, 165, 192, 144])
_reference_r = correl(X, Y)
print(_reference_r)
assert np.isclose(_reference_r, 0.46706598573232)
|
|
|
|
# Load the ratings table. The first two columns are read as the user id
# ("User") and the gender code (GENDER); every remaining column is treated
# as one movie's ratings.
data = pd.read_csv("HW1-data.csv")

_user_columns = data.iloc[:, :2].to_dict(orient="list")
users = _user_columns["User"]
# Map each user id to that user's gender code.
gender_data = dict(zip(users, _user_columns[GENDER]))
print(gender_data)

movie_data = data.iloc[:, 2:]
|
|
|
|
# Response 1: the three movies with the highest mean rating.
print("Response 1: highest average")
print(movie_data.mean().sort_values(ascending=False).head(3))
print()

# Response 2: the three movies with the most (non-missing) ratings.
print("Response 2: popularity (# of reviews)")
print(movie_data.count().sort_values(ascending=False).head(3))
print()

# Response 3: per movie, the share of its ratings that are 4 or higher
# (the comparison is inclusive), top three.
print("Response 3: %reviews greater than 4")
print(((movie_data >= 4).sum() / movie_data.count()).sort_values(ascending=False).head(3))
print()
|
|
|
|
print("Response 4: %correlation with Toy Story")
# Toy Story's ratings, one entry per row of movie_data.
toy_story_raters = movie_data.T.loc[TOY_STORY]
# Boolean mask: True where a Toy Story rating is present.
toy_story_raters_n = ~toy_story_raters.isna()
|
|
def toy_story_correlate(s):
    """Return the fraction of Toy Story raters who also rated movie ``s``.

    Args:
        s: One movie's ratings, indexed like ``toy_story_raters``.

    Returns:
        (number of rows with a rating for both Toy Story and ``s``) divided
        by (number of rows with a Toy Story rating).
    """
    rated_both = sum(
        1
        for row, has_rating in s.notna().items()
        if toy_story_raters_n[row] and has_rating
    )
    return rated_both / toy_story_raters.count()
|
|
# Top five movies by co-rating share with Toy Story.
print(movie_data.apply(toy_story_correlate).sort_values(ascending=False).head(5))
print()

print("Response 5: correlation with Toy Story")
|
|
def toy_story_correlate2(s):
    """Return ``correl`` of movie ``s``'s ratings with Toy Story's ratings."""
    return correl(x=s, y=toy_story_raters)
|
|
# Every movie's correlation with Toy Story, highest first.
_toy_story_correlations = movie_data.apply(toy_story_correlate2)
print(_toy_story_correlations.sort_values(ascending=False))
print("TODO NOT DONE")
print()

print("Response 6: Mean difference by gender")
|
|
# NOTE: the parameter name ``id`` shadows the builtin; it is kept so that
# existing keyword callers keep working.
def isman(id):
    """Return True when the gender code stored for user ``id`` is falsy (0)."""
    return not gender_data[id]


def iswoman(id):
    """Return the gender code stored for user ``id`` (truthy for 1)."""
    return gender_data[id]
|
|
def gender_rating(g, s):
    """Return the mean rating in ``s`` over users whose gender code equals ``g``.

    Args:
        g: Gender code to select (compared against ``gender_data`` values).
        s: One movie's ratings; each index label is used as a position into
            ``users`` to find the rating user's id.

    Returns:
        Mean of the selected ratings, as computed by ``pandas.Series.mean``.
    """
    selected = [
        rating
        for row, rating in s.items()
        if gender_data[users[row]] == g
    ]
    return pd.Series(selected).mean()
|
|
# Per-movie mean rating for each gender code (0 and 1).
men_ratings = movie_data.apply(lambda column: gender_rating(0, column))
women_ratings = movie_data.apply(lambda column: gender_rating(1, column))
print(men_ratings, women_ratings)
|