실습 중 에러 'numpy.ndarray' object has no attribute 'drop'
안녕하세요. 전체 코드입니다.
# CF_knn -- the full script posted with the question: user-based collaborative
# filtering where the prediction can be restricted to the k most similar users.
#
# Reported error: "'numpy.ndarray' object has no attribute 'drop'".
# CF_knn() calls .drop() on a column of `user_similarity`, which only works
# while `user_similarity` is a pandas DataFrame.  cosine_similarity() returns a
# plain numpy array, so the name must never be left bound to that raw array
# (e.g. by re-running only the similarity step in a notebook).  Below, the
# DataFrame is therefore built in a single statement.
import os

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# Rating returned when no similarity-weighted estimate can be computed.
DEFAULT_RATING = 3.0

## Load the data and define the required functions
base_src = 'drive/MyDrive/SCIT/colab/deep_learning_data'

u_user_src = os.path.join(base_src, 'u.user')
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv(u_user_src, sep='|', names=u_cols, encoding='latin-1')
users = users.set_index('user_id')

u_item_src = os.path.join(base_src, 'u.item')
# NOTE: the posted code had 'Children\s' (an invalid escape sequence, most
# likely a lost quote from 'Children\'s'); the intended column name is used.
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDB URL',
    'unknown', 'Action', 'Adventure', 'Animation', "Children's", 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
movies = pd.read_csv(u_item_src, sep='|', names=i_cols, encoding='latin-1')
movies = movies.set_index('movie_id')

u_data_src = os.path.join(base_src, 'u.data')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv(u_data_src, sep='\t', names=r_cols, encoding='latin-1')


def RMSE(y_true, y_pred):
    """Return the root-mean-square error between two equal-length sequences."""
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred)) ** 2))


def score(model, neighbor_size=0):
    """Return the RMSE of `model` over the module-level test set `x_test`.

    `neighbor_size` was added to the original score function so that the size
    of the similar-user group can be fixed in advance; it is passed straight
    through as the third argument of `model(user_id, movie_id, neighbor_size)`.
    """
    # Pair up user_id and movie_id of every test row as (user, movie) tuples.
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    # Predict a rating for every user-movie pair with the given model.
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    # Actual ratings.
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)


# Train/test split, stratified on user_id.
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

# user x movie matrix of training ratings (NaN where a user has no rating).
ratings_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')

# Cosine similarity between every pair of users in the train set.
# Missing ratings are replaced by 0 on a copy so the computation does not hit NaN.
matrix_dummy = ratings_matrix.copy().fillna(0)
# Wrap the result in a DataFrame immediately, labelled by user_id on both axes,
# so that values can be looked up by user_id and pandas methods are available.
user_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy, matrix_dummy),
    index=ratings_matrix.index,
    columns=ratings_matrix.index,
)


def CF_knn(user_id, movie_id, neighbor_size=0):
    """Predict the rating of `movie_id` by `user_id` from similar users' ratings.

    The prediction is the similarity-weighted mean of the training ratings for
    the movie.  With `neighbor_size == 0` every user who rated the movie
    contributes; otherwise only the `neighbor_size` most similar raters do.

    Returns DEFAULT_RATING when the movie or the user is absent from the
    training data, when `neighbor_size` is non-zero and at most one user rated
    the movie, or when the similarities sum to zero (the weighted mean would
    otherwise be NaN/inf and corrupt the RMSE).
    """
    if movie_id not in ratings_matrix.columns or user_id not in user_similarity.columns:
        return DEFAULT_RATING

    # Similarities between the target user and every other user.
    sim_scores = user_similarity[user_id].copy()
    # Every user's rating for the target movie.
    movie_ratings = ratings_matrix[movie_id].copy()
    # Drop the users who did not rate the movie from both series.
    none_rating_idx = movie_ratings[movie_ratings.isnull()].index
    movie_ratings = movie_ratings.dropna()
    sim_scores = sim_scores.drop(none_rating_idx)

    if neighbor_size != 0:
        if len(sim_scores) <= 1:
            return DEFAULT_RATING
        # Cannot use more neighbours than there are raters.
        neighbor_size = min(neighbor_size, len(sim_scores))
        sim_scores = np.array(sim_scores)
        movie_ratings = np.array(movie_ratings)
        # argsort is ascending, so the last `neighbor_size` entries are the
        # most similar users.
        user_idx = np.argsort(sim_scores)
        sim_scores = sim_scores[user_idx][-neighbor_size:]
        movie_ratings = movie_ratings[user_idx][-neighbor_size:]

    total_similarity = sim_scores.sum()
    if total_similarity == 0:
        return DEFAULT_RATING
    return np.dot(sim_scores, movie_ratings) / total_similarity


# Compute the accuracy (RMSE) using the 30 nearest neighbours.
score(CF_knn, neighbor_size=30)

# Poster's note: the actual user-recommendation feature consists solely of the
# part posted in the question.
실제 사용자 추천 기능은 질문글에 올린 부분이 전부입니다.