에러 이유

Question

안녕하세요! 캐글에 올려주신 자료로 공부 중 에러가 떴는데 어떻게 해결해야할지 몰라 질문 남깁니다! 너무 길어서 보기 어려운 점 미리 사과드립니다..ㅠㅠ import pandas as pd train = pd.read_csv("/kaggle/input/big-data-analytics-certification-kr-2024-3/train.csv") test = pd.read_csv("/kaggle/input/big-data-analytics-certification-kr-2024-3/test.csv") ​ # EDA # print(train.shape, test.shape) #(1168, 81) (292, 80) # print(train.info()) #float64(3), int64(35), object(43) # print(test.info()) #float64(3), int64(34), object(43) # print(train.isnull().sum().sort_values(ascending=False)[:10]) ​ #범주형 데이터가 너무 많아서 수치형만 선택 train=train.select_dtypes(exclude=['object']) test=train.select_dtypes(exclude=['object']) ​ # print(train.head(2)) # print(train.isnull().sum()) #LotFrontage 218, GarageYrBlt 69 # print(test.isnull().sum()) ​ # print(train['LotFrontage'].describe()) # print(train['GarageYrBlt'].describe()) ​ #전처리(결측치, 타겟값 분리) target=train.pop('SalePrice') train=train.drop('Id',axis=1) test_id=test.pop('Id') ​ train['LotFrontage']=train['LotFrontage'].fillna(train['LotFrontage'].mean()) train['GarageYrBlt']=train['GarageYrBlt'].fillna(train['GarageYrBlt'].mean()) train['MasVnrArea']=train['MasVnrArea'].fillna(train['MasVnrArea'].mean()) test['LotFrontage']=test['LotFrontage'].fillna(test['LotFrontage'].mean()) test['GarageYrBlt']=test['GarageYrBlt'].fillna(test['GarageYrBlt'].mean()) test['MasVnrArea']=test['MasVnrArea'].fillna(test['MasVnrArea'].mean()) ​ # print(train.isnull().sum().sum()) # print(test.isnull().sum().sum()) ​ #데이터 분리 from sklearn.model_selection import train_test_split X_tr, X_val, y_tr, y_val = train_test_split(train, target, test_size=0.2, random_state=0) # print(X_tr.shape, X_val.shape, y_tr.shape, y_val.shape) ​ #랜포 from sklearn.ensemble import RandomForestRegressor rf=RandomForestRegressor(random_state=0) rf.fit(X_tr, y_tr) pred=rf.predict(X_val) ​ #평가 from sklearn.metrics import mean_squared_error def rmse(y, y_pred): return mean_squared_error(y, y_pred)**0.5 # print(rmse(y_val, pred)) #중앙값 : 34668.70085343153 #평균 : 
33430.8118326734 # 최댓값 : 34100.46200633792 #최솟값 : 34023.36640178194 ​ #예측 pred=rf.predict(test) submit=pd.DataFrame({'Id':test_id, 'SalePrice':pred}) submit.to_csv('0000.csv', index=False) ​ pd.read_csv('0000.csv') --------------------------------------------------------------------------- ValueError Traceback (most recent call last) Cell In[55], line 68 60 return mean_squared_error(y, y_pred)**0.5 61 # print(rmse(y_val, pred)) 62 #중앙값 : 34668.70085343153 63 #평균 : 33430.8118326734 (...) 66 67 #예측 ---> 68 pred=rf.predict(test) 69 submit=pd.DataFrame({'Id':test_id, 'SalePrice':pred}) 70 submit.to_csv('0000.csv', index=False) File /opt/conda/lib/python3.10/site-packages/sklearn/ensemble/_forest.py:981, in ForestRegressor.predict(self, X) 979 check_is_fitted(self) 980 # Check data --> 981 X = self._validate_X_predict(X) 983 # Assign chunk of trees to jobs 984 n_jobs, _, _ = _partition_estimators(self.n_estimators, self.n_jobs) File /opt/conda/lib/python3.10/site-packages/sklearn/ensemble/_forest.py:602, in BaseForest._validate_X_predict(self, X) 599 """ 600 Validate X whenever one tries to predict, apply, predict_proba.""" 601 check_is_fitted(self) --> 602 X = self._validate_data(X, dtype=DTYPE, accept_sparse="csr", reset=False) 603 if issparse(X) and (X.indices.dtype != np.intc or X.indptr.dtype != np.intc): 604 raise ValueError("No support for np.int64 index based sparse matrices") File /opt/conda/lib/python3.10/site-packages/sklearn/base.py:548, in BaseEstimator._validate_data(self, X, y, reset, validate_separately, **check_params) 483 def _validate_data( 484 self, 485 X="no_validation", (...) 489 **check_params, 490 ): 491 """Validate input data and set or check the `n_features_in_` attribute. 492 493 Parameters (...) 546 validated. 
547 """ --> 548 self._check_feature_names(X, reset=reset) 550 if y is None and self._get_tags()["requires_y"]: 551 raise ValueError( 552 f"This {self.__class__.__name__} estimator " 553 "requires y to be passed, but the target y is None." 554 ) File /opt/conda/lib/python3.10/site-packages/sklearn/base.py:481, in BaseEstimator._check_feature_names(self, X, reset) 476 if not missing_names and not unexpected_names: 477 message += ( 478 "Feature names must be in the same order as they were in fit.
" 479 ) --> 481 raise ValueError(message) ValueError: The feature names should match those that were passed during fit. Feature names unseen at fit time: - SalePric

퇴근후딴짓 · Answer

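학습에서는 수치형 컬럼만 선택하고 SalePrice를 분리해서 학습했는데, 예측에 사용한 test 데이터에는 학습 때 없던 SalePrice 컬럼이 들어 있네요. 수치형 컬럼을 선택하는 부분에서 test=train.select_dtypes(exclude=['object'])처럼 test를 train으로 만들었기 때문입니다. 이 줄을 test=test.select_dtypes(exclude=['object'])로 고치면 학습과 예측의 컬럼이 일치해서 에러가 사라집니다. 그리고 코드를 넣을 때는 "코드 블럭"을 먼저 선택하고 코드를 복사-붙여넣기 해주세요 🙂 반대로 하면 보기가 힘들어지네요 하하! 화이팅입니다.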