작업형2_모의문제2
#1. 문제탐색 # 회귀모델 csv id,price # target : price # R-Squared, MAE, MSE, RMSE, RMSLE, MAPE # 2. 데이터불러오기 import pandas as pd import numpy as np train = pd.read_csv("https://raw.githubusercontent.com/lovedlim/inf/main/p2/ab_nyc/train.csv") test = pd.read_csv("https://raw.githubusercontent.com/lovedlim/inf/main/p2/ab_nyc/test.csv") # train = pd.read_csv("train.csv") # test = pd.read_csv("test.csv") #3 데이터 탐색 #print(train.shape, test.shape) #print(train.head(3), test.head(3)) #print(train.isnull().sum(), test.isnull().sum()) #print(train.info(), test.info()) #4. 데이터 전처리 #print(train.shape, test.shape) cols = ['name', 'host_name', 'host_id'] train = train.drop(cols, axis =1 ) test = test.drop(cols, axis =1 ) #print(train.shape, test.shape) train['last_review'] = train['last_review'].mode()[0] test['last_review'] = test['last_review'].mode()[0] train['reviews_per_month'] = train['reviews_per_month'].fillna(0) test['reviews_per_month'] = test['reviews_per_month'].fillna(0) #변수부여 target = train.pop('price') train = train.drop('id', axis=1) id = test.pop('id') # #레이블인코딩 train=pd.get_dummies(train) test=pd.get_dummies(test) from sklearn.model_selection import train_test_split X_tr, X_val, y_tr, y_val = train_test_split(train, target, test_size = 0.2, random_state = 2) # #6.모델 학습 및 평가 from sklearn.ensemble import RandomForestRegressor rf = RandomForestRegressor(random_state = 2) rf.fit(X_tr, y_tr) pred = rf.predict(X_val) from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error mae = mean_absolute_error(y_val, pred) mse = mean_squared_error(y_val, pred) mape = mean_absolute_percentage_error(y_val, pred) rmse = np.sqrt(mean_squared_error(y_val, pred)) print(f"mae: {mean_absolute_error(y_val, pred)}") print(f"mse: {mean_squared_error(y_val, pred)}") print(f"mape: {mean_absolute_percentage_error(y_val, pred)}") pred = rf.predict(test) submit = pd.DataFrame({'id' : id, 'price' : pred}) submit.to_csv("0516.csv", index = False) pd.read_csv("0516.csv")말씀하신 대로 원핫인코딩으로 변환했는데 계속 에러가 뜨네요 컬럼 불일치 에러라고 해서 컬럼 비교해봤는데 동일해서요 ㅠㅠㅠ이걸어떻게해결해야할까요!?--------------------------------------------------------------------------- ValueError Traceback (most recent call last) in () 61 print(f"mape: {mean_absolute_percentage_error(y_val, pred)}") 62 ---> 63 pred = rf.predict(test) 64 submit = pd.DataFrame({'id' : id, 'price' : pred}) 65 submit.to_csv("0516.csv", index = False) 3 frames /usr/local/lib/python3.11/dist-packages/sklearn/utils/validation.py in _check_feature_names(estimator, X, reset) 2775 message += "Feature names must be in the same order as they were in fit.\n" 2776 -> 2777 raise ValueError(message) 2778 2779 ValueError: The feature names should match those that were passed during fit. Feature names unseen at fit time: - last_review_2019-07-01 Feature names seen at fit time, yet now missing: - last_review_2019-06-23 - neighbourhood_Arden Heights - neighbourhood_Bay Terrace, Staten Island - neighbourhood_Breezy Point - neighbourhood_Castleton Corners - ...