commit e228e16f222fcf89205e89c3d903e1d53b6712d1 Author: Michael Zhang Date: Mon Sep 6 00:19:37 2021 -0500 hw1 diff --git a/HW1-data.csv b/HW1-data.csv new file mode 100644 index 0000000..866fd59 --- /dev/null +++ b/HW1-data.csv @@ -0,0 +1,21 @@ +User,"Gender (1 =F, 0=M)",260: Star Wars: Episode IV - A New Hope (1977),1210: Star Wars: Episode VI - Return of the Jedi (1983),356: Forrest Gump (1994),"318: Shawshank Redemption, The (1994)","593: Silence of the Lambs, The (1991)",3578: Gladiator (2000),1: Toy Story (1995),2028: Saving Private Ryan (1998),296: Pulp Fiction (1994),1259: Stand by Me (1986),2396: Shakespeare in Love (1998),2916: Total Recall (1990),780: Independence Day (ID4) (1996),541: Blade Runner (1982),1265: Groundhog Day (1993),"2571: Matrix, The (1999)",527: Schindler's List (1993),"2762: Sixth Sense, The (1999)",1198: Raiders of the Lost Ark (1981),34: Babe (1995) +755,0,1,5,2,,4,4,2,2,,3,2,,5,2,,4,2,5,, +5277,0,5,3,,2,4,2,1,,,4,3,2,2,,2,,5,1,3, +1577,1,,,,5,2,,4,,,1,,1,4,4,1,1,2,3,1,3 +4388,0,,3,,,,1,2,3,4,,,4,1,3,5,,5,1,1,2 +1202,1,4,3,4,1,4,1,,4,,1,5,1,,4,,3,5,5,, +3823,1,2,4,4,4,,,3,1,4,4,5,2,4,,1,,,3,,2 +5448,0,,,3,1,1,4,,5,2,,1,,,3,,1,,,5,2 +5347,0,4,,,,3,2,2,,3,,,2,3,2,4,,1,3,5, +4117,1,5,1,,4,2,4,4,4,,1,2,3,1,,5,,,,,5 +2765,0,4,2,,5,3,,4,3,4,,,,2,,,2,5,1,, +5450,1,2,1,5,,,5,5,,,,,3,2,,,1,,2,1,4 +139,0,3,5,2,,2,,2,,1,,3,,3,,2,5,,,,2 +1940,0,2,3,,5,4,,4,5,,,,2,4,,3,,,,5, +3118,1,3,,3,,2,,3,,,4,,1,2,2,3,5,1,,, +4656,1,4,4,,,5,5,2,,3,5,,1,3,,2,,3,,3,1 +4796,1,,,1,,3,2,,2,,1,5,,,,5,2,2,4,3,4 +6037,0,,,,,4,,2,,2,,2,,,,,4,,,, +3048,1,4,5,1,5,1,1,4,,5,,,,,4,,,2,1,2,5 +4790,0,5,1,,,,4,2,1,3,3,3,1,,,,2,,,, +4489,0,1,2,2,4,5,,2,3,2,2,1,,,5,5,4,3,5,3, diff --git a/HW1-data.ods b/HW1-data.ods new file mode 100644 index 0000000..d4509bb Binary files /dev/null and b/HW1-data.ods differ diff --git a/HW1.py b/HW1.py new file mode 100644 index 0000000..1a8d6e5 --- /dev/null +++ b/HW1.py @@ -0,0 +1,77 @@ +import pandas as pd +import numpy as np +import 
functools  # tail of the "import functools" statement that is split across the previous line

# Column headers exactly as they appear in HW1-data.csv.
GENDER = "Gender (1 =F, 0=M)"
TOY_STORY = "1: Toy Story (1995)"

# Small sample with a known Pearson coefficient; main() uses it to
# sanity-check correl() before any rating data is read.
X = pd.Series([195, 151, 148, 189, 183, 154])
Y = pd.Series([200, 180, 178, 165, 192, 144])


def dbg(v):
    """Print ``v`` and return it unchanged, so it can wrap any expression."""
    print(v)
    return v


def correl(x, y):
    """Return the Pearson correlation coefficient of ``x`` and ``y``.

    Values are paired by position. A pair contributes only when both of its
    values are present (not NaN/None); the means and the denominator are
    computed over those same pairs. Dropping missing values from each input
    separately would pair up observations that belong to different positions.

    Args:
        x: Sequence or Series of numbers, possibly containing NaN/None.
        y: Sequence or Series of numbers with the same length as ``x``.

    Returns:
        The coefficient as a float, or NaN when fewer than two complete
        pairs exist or when either side has zero variance.

    Raises:
        ValueError: If ``x`` and ``y`` do not have the same length.
    """
    xs = np.asarray(x, dtype=float)
    ys = np.asarray(y, dtype=float)
    if xs.shape != ys.shape:
        raise ValueError("correl() needs inputs of equal length")

    # Keep only the positions where both observations exist.
    complete = ~(np.isnan(xs) | np.isnan(ys))
    xs = xs[complete]
    ys = ys[complete]
    if xs.size < 2:
        return np.nan

    dx = xs - xs.mean()
    dy = ys - ys.mean()
    den = np.sqrt((dx ** 2).sum() * (dy ** 2).sum())
    if den == 0:
        # A constant series has no defined correlation.
        return np.nan
    return (dx * dy).sum() / den


def toy_story_correlate(s, toy_story_raters):
    """Return the fraction of Toy Story raters who also rated movie ``s``.

    Args:
        s: Ratings of one movie (NaN where a user gave no rating).
        toy_story_raters: Toy Story ratings on the same index as ``s``.
    """
    rated_both = (s.notna() & toy_story_raters.notna()).sum()
    return rated_both / toy_story_raters.count()


def toy_story_correlate2(s, toy_story_raters):
    """Return the Pearson correlation between movie ``s`` and Toy Story."""
    return correl(s, toy_story_raters)


def isman(user_id, gender_data):
    """Return True when ``gender_data`` holds a falsy flag (0 = M) for the user."""
    return not gender_data[user_id]


def iswoman(user_id, gender_data):
    """Return the flag ``gender_data`` holds for the user (1 = F is truthy)."""
    return gender_data[user_id]


def gender_rating(g, s, genders):
    """Return the mean rating of movie ``s`` among rows whose gender is ``g``.

    Args:
        g: Gender flag to select (the CSV header defines 1 = F, 0 = M).
        s: Ratings of one movie (NaN where a user gave no rating).
        genders: Gender flag of each row, on the same index as ``s``.

    Returns:
        The mean of the selected ratings; missing ratings are skipped.
    """
    return s[genders == g].mean()


def main():
    """Load HW1-data.csv and print the six homework responses."""
    # Self-check of correl() against the known coefficient for X and Y.
    r = correl(X, Y)
    print(r)
    if not np.isclose(r, 0.46706598573232):
        raise AssertionError("correl() self-check failed: got {!r}".format(r))

    data = pd.read_csv("HW1-data.csv")

    # First two columns are the user id and the gender flag; the rest are
    # one column of ratings per movie.
    gender_data = data.iloc[:, :2].to_dict(orient="list")
    gender_data = dict(zip(gender_data["User"], gender_data[GENDER]))
    print(gender_data)
    # Gender flag of every row, looked up through the user-id mapping.
    row_gender = data["User"].map(gender_data)

    movie_data = data.iloc[:, 2:]

    print("Response 1: highest average")
    print(movie_data.mean().sort_values(ascending=False)[:3])
    print()

    print("Response 2: popularity (# of reviews)")
    print(movie_data.count().sort_values(ascending=False)[:3])
    print()

    # The label reads "greater than 4" while the computation counts ratings
    # of 4 or higher; both are kept as they were.
    print("Response 3: %reviews greater than 4")
    share_high = (movie_data >= 4).sum() / movie_data.count()
    print(share_high.sort_values(ascending=False)[:3])
    print()

    print("Response 4: %correlation with Toy Story")
    toy_story_raters = movie_data[TOY_STORY]
    co_rated = movie_data.apply(
        functools.partial(toy_story_correlate, toy_story_raters=toy_story_raters)
    )
    print(co_rated.sort_values(ascending=False)[:5])
    print()

    print("Response 5: correlation with Toy Story")
    pearson = movie_data.apply(
        functools.partial(toy_story_correlate2, toy_story_raters=toy_story_raters)
    )
    print(pearson.sort_values(ascending=False))
    # Marker left by the author; retained so the output format is unchanged.
    print("TODO NOT DONE")
    print()

    # Prints the two per-gender mean Series; their difference is not computed.
    print("Response 6: Mean difference by gender")
    men_ratings = movie_data.apply(
        functools.partial(gender_rating, 0, genders=row_gender)
    )
    women_ratings = movie_data.apply(
        functools.partial(gender_rating, 1, genders=row_gender)
    )
    print(men_ratings, women_ratings)


if __name__ == "__main__":
    main()