"""Summary statistics over the rating table stored in ``HW1-data.csv``.

The first two CSV columns are read as ``"User"`` and the gender flag named by
``GENDER``; every remaining column is treated as one movie's ratings, with
NaN where a row has no rating for that movie.

The report itself only runs when the file is executed as a script, so the
helper functions can be imported without the CSV being present.  The helpers
``toy_story_correlate``, ``toy_story_correlate2``, ``gender_rating``,
``isman`` and ``iswoman`` read module-level globals (``toy_story_raters``,
``toy_story_raters_n``, ``users``, ``gender_data``) that the script section
assigns.
"""

import functools

import numpy as np
import pandas as pd

GENDER = "Gender (1 =F, 0=M)"
TOY_STORY = "1: Toy Story (1995)"


def dbg(v):
    """Print *v* and return it unchanged, so it can wrap any expression."""
    print(v)
    return v


def correl(x, y):
    """Return the Pearson correlation of *x* and *y* over complete pairs.

    Values are paired by position.  A position is used only when BOTH inputs
    have a non-NaN value there; dropping NaNs from each input independently
    would pair values that belong to different rows.

    Args:
        x: Sequence of numbers (list, ndarray, ``pd.Series``); NaN = missing.
        y: Sequence of numbers with the same length as *x*.

    Returns:
        The correlation coefficient, or NaN when it is undefined (no complete
        pairs, or one side is constant over the complete pairs).

    Raises:
        ValueError: If *x* and *y* do not have the same shape.
    """
    xs = np.asarray(x, dtype=float)
    ys = np.asarray(y, dtype=float)
    if xs.shape != ys.shape:
        raise ValueError(
            f"correl needs equally sized inputs, got {xs.shape} and {ys.shape}"
        )

    # Keep only the positions where both sides carry a value.
    complete = ~(np.isnan(xs) | np.isnan(ys))
    xs = xs[complete]
    ys = ys[complete]
    if xs.size == 0:
        return np.nan

    dx = xs - xs.mean()
    dy = ys - ys.mean()
    den = np.sqrt((dx ** 2).sum() * (dy ** 2).sum())
    if den == 0:
        # Zero variance on at least one side: the coefficient is undefined.
        return np.nan
    return (dx * dy).sum() / den


def toy_story_correlate(s):
    """Return the share of Toy Story raters that also have a value in *s*.

    Counts the rows where both *s* and the Toy Story column are non-null and
    divides by the number of non-null Toy Story entries.  Reads the globals
    ``toy_story_raters`` and ``toy_story_raters_n``.
    """
    rated_both = s.notna() & toy_story_raters_n
    return rated_both.sum() / toy_story_raters.count()


def toy_story_correlate2(s):
    """Return ``correl`` of *s* against the global ``toy_story_raters``."""
    return correl(s, toy_story_raters)


def isman(id):
    """Return True when ``gender_data[id]`` is falsy.

    The parameter keeps its original name ``id`` (shadowing the builtin) so
    keyword callers keep working.
    """
    return not gender_data[id]


def iswoman(id):
    """Return ``gender_data[id]`` unchanged (the raw flag, not a bool)."""
    return gender_data[id]


def gender_rating(g, s):
    """Return the mean of the entries of *s* whose row has gender flag *g*.

    Each index label of *s* is used as a position into the global ``users``
    list, and the resulting user is looked up in the global ``gender_data``.
    NaN entries are ignored by the mean.
    """
    selected = np.array(
        [gender_data[users[row]] == g for row in s.index], dtype=bool
    )
    return s[selected].mean()


if __name__ == "__main__":
    # Sanity check of correl against a known value.
    X = pd.Series([195, 151, 148, 189, 183, 154])
    Y = pd.Series([200, 180, 178, 165, 192, 144])
    print(correl(X, Y))
    assert np.isclose(correl(X, Y), 0.46706598573232)

    data = pd.read_csv("HW1-data.csv")

    # First two columns identify the row: user and gender flag.
    id_columns = data.iloc[:, :2].to_dict(orient="list")
    users = id_columns["User"]
    gender_data = dict(zip(users, id_columns[GENDER]))
    print(gender_data)

    # Everything after the first two columns is handled as movie columns.
    movie_data = data.iloc[:, 2:]

    print("Response 1: highest average")
    print(movie_data.mean().sort_values(ascending=False).head(3))
    print()

    print("Response 2: popularity (# of reviews)")
    print(movie_data.count().sort_values(ascending=False).head(3))
    print()

    print("Response 3: %reviews greater than 4")
    share_high = (movie_data >= 4).sum() / movie_data.count()
    print(share_high.sort_values(ascending=False).head(3))
    print()

    print("Response 4: %correlation with Toy Story")
    toy_story_raters = movie_data[TOY_STORY]
    toy_story_raters_n = toy_story_raters.notna()
    co_rated = movie_data.apply(toy_story_correlate)
    print(co_rated.sort_values(ascending=False).head(5))
    print()

    print("Response 5: correlation with Toy Story")
    print(movie_data.apply(toy_story_correlate2).sort_values(ascending=False))
    # NOTE(review): marker kept from the original script.  correl now uses
    # pairwise-complete rows; confirm nothing else is outstanding before
    # removing it.
    print("TODO NOT DONE")
    print()

    print("Response 6: Mean difference by gender")
    men_ratings = movie_data.apply(functools.partial(gender_rating, 0))
    women_ratings = movie_data.apply(functools.partial(gender_rating, 1))
    print(men_ratings, women_ratings)