์์
ํ2_๋ชจ์๋ฌธ์ 2
#1. ๋ฌธ์ ํ์ # ํ๊ท๋ชจ๋ธ csv id,price # target : price # R-Squared, MAE, MSE, RMSE, RMSLE, MAPE # 2. ๋ฐ์ดํฐ๋ถ๋ฌ์ค๊ธฐ import pandas as pd import numpy as np train = pd.read_csv("https://raw.githubusercontent.com/lovedlim/inf/main/p2/ab_nyc/train.csv") test = pd.read_csv("https://raw.githubusercontent.com/lovedlim/inf/main/p2/ab_nyc/test.csv") # train = pd.read_csv("train.csv") # test = pd.read_csv("test.csv") #3 ๋ฐ์ดํฐ ํ์ #print(train.shape, test.shape) #print(train.head(3), test.head(3)) #print(train.isnull().sum(), test.isnull().sum()) #print(train.info(), test.info()) #4. ๋ฐ์ดํฐ ์ ์ฒ๋ฆฌ #print(train.shape, test.shape) cols = ['name', 'host_name', 'host_id'] train = train.drop(cols, axis =1 ) test = test.drop(cols, axis =1 ) #print(train.shape, test.shape) train['last_review'] = train['last_review'].mode()[0] test['last_review'] = test['last_review'].mode()[0] train['reviews_per_month'] = train['reviews_per_month'].fillna(0) test['reviews_per_month'] = test['reviews_per_month'].fillna(0) #๋ณ์๋ถ์ฌ target = train.pop('price') train = train.drop('id', axis=1) id = test.pop('id') # #๋ ์ด๋ธ์ธ์ฝ๋ฉ train=pd.get_dummies(train) test=pd.get_dummies(test) from sklearn.model_selection import train_test_split X_tr, X_val, y_tr, y_val = train_test_split(train, target, test_size = 0.2, random_state = 2) # #6.๋ชจ๋ธ ํ์ต ๋ฐ ํ๊ฐ from sklearn.ensemble import RandomForestRegressor rf = RandomForestRegressor(random_state = 2) rf.fit(X_tr, y_tr) pred = rf.predict(X_val) from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error mae = mean_absolute_error(y_val, pred) mse = mean_squared_error(y_val, pred) mape = mean_absolute_percentage_error(y_val, pred) rmse = np.sqrt(mean_squared_error(y_val, pred)) print(f"mae: {mean_absolute_error(y_val, pred)}") print(f"mse: {mean_squared_error(y_val, pred)}") print(f"mape: {mean_absolute_percentage_error(y_val, pred)}") pred = rf.predict(test) submit = pd.DataFrame({'id' : id, 'price' : pred}) submit.to_csv("0516.csv", index = False) pd.read_csv("0516.csv")๋ง์ํ์ ๋๋ก ์ํซ์ธ์ฝ๋ฉ์ผ๋ก ๋ณํํ๋๋ฐ ๊ณ์ ์๋ฌ๊ฐ ๋จ๋ค์ ์ปฌ๋ผ ๋ถ์ผ์น ์๋ฌ๋ผ๊ณ ํด์ ์ปฌ๋ผ ๋น๊ตํด๋ดค๋๋ฐ ๋์ผํด์์ ใ
ใ
ใ
์ด๊ฑธ์ด๋ป๊ฒํด๊ฒฐํด์ผํ ๊น์!?--------------------------------------------------------------------------- ValueError Traceback (most recent call last) in () 61 print(f"mape: {mean_absolute_percentage_error(y_val, pred)}") 62 ---> 63 pred = rf.predict(test) 64 submit = pd.DataFrame({'id' : id, 'price' : pred}) 65 submit.to_csv("0516.csv", index = False) 3 frames /usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py in _check_feature_names(estimator, X, reset) 2775 message += "Feature names must be in the same order as they were in fit.\n" 2776 -> 2777 raise ValueError(message) 2778 2779 ValueError: The feature names should match those that were passed during fit. Feature names unseen at fit time: - last_review_2019-07-01 Feature names seen at fit time, yet now missing: - last_review_2019-06-23 - neighbourhood_Arden Heights - neighbourhood_Bay Terrace, Staten Island - neighbourhood_Breezy Point - neighbourhood_Castleton Corners - ...