T2-1
강사님, 해당 문제 아래와 같이 코드를 짰고 결과값을 도출했습니다. 이 정도 평가지표와 예측 결과면 40점 받는 데에 문제 없을까요? 맨 아래 사진이 코드이고 그 위의 값들이 결과값입니다.
0.839453284373725
[2]: 0.9876977152899824
# 시험환경 세팅
# (Do not modify the code in the setup section: exam_data_load and the data load.)
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


def exam_data_load(df, target, id_name="", null_name=""):
    """Split ``df`` into train/test feature frames and matching target frames.

    Args:
        df: Input DataFrame containing the ``target`` column.
        target: Name of the label column.
        id_name: Name of the id column. When empty, the index is reset and
            exposed as a new ``'id'`` column, which is then used as the id.
        null_name: Placeholder value for missing data. When non-empty, every
            cell equal to it is overwritten with ``np.nan``.

    Returns:
        ``(X_train, X_test, y_train, y_test)``. The ``X_*`` frames have the
        ``target`` column dropped (the id column is kept); the ``y_*`` frames
        hold exactly ``[id_name, target]``. The split is 80/20 with
        ``random_state=2021``.
    """
    if id_name == "":
        df = df.reset_index().rename(columns={"index": "id"})
        id_name = 'id'
    else:
        id_name = id_name

    if null_name != "":
        # NOTE: when id_name was supplied, this writes into the caller's frame.
        df[df == null_name] = np.nan

    X_train, X_test = train_test_split(df, test_size=0.2, random_state=2021)

    y_train = X_train[[id_name, target]]
    X_train = X_train.drop(columns=[target])

    y_test = X_test[[id_name, target]]
    X_test = X_test.drop(columns=[target])
    return X_train, X_test, y_train, y_test


df = pd.read_csv("../input/titanic/train.csv")
X_train, X_test, y_train, y_test = exam_data_load(
    df, target='Survived', id_name='PassengerId'
)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

# ---------------------------------------------------------------------------
# User solution
# ---------------------------------------------------------------------------

# Drop the free-text / high-cardinality columns from both splits.
cols = ['Name', 'Cabin', 'Ticket']
X_train = X_train.drop(columns=cols)
X_test = X_test.drop(columns=cols)

# Learn the imputation statistic on the training split only and reuse it for
# the test split, so no information from the test data shapes preprocessing.
age_median = X_train['Age'].median()
X_train['Age'] = X_train['Age'].fillna(age_median)
X_test['Age'] = X_test['Age'].fillna(age_median)

X_train['Embarked'] = X_train['Embarked'].fillna('S')
X_test['Embarked'] = X_test['Embarked'].fillna('S')

# Encode the categorical columns; the encoder is fitted on train and only
# applied to test (a label unseen in train would raise here).
cols = ['Sex', 'Embarked']
for col in cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])

# Preprocessed training features joined with the label; not used by the
# modelling steps below.
df = pd.concat([X_train, y_train['Survived']], axis=1)

# The id is not a feature: remove it from train, and set it aside for the
# submission file on the test side.
X_train = X_train.drop('PassengerId', axis=1)
X_test_id = X_test.pop('PassengerId')

X_tr, X_val, y_tr, y_val = train_test_split(
    X_train, y_train['Survived'], test_size=0.2, random_state=2023
)

# Fixed seed so the printed metrics are reproducible between runs.
model = RandomForestClassifier(random_state=2023)
model.fit(X_tr, y_tr)

# ROC-AUC ranks samples by a continuous score, so feed it the predicted
# probability of the positive class (column 1 == model.classes_[1]) rather
# than hard 0/1 predictions.
val_proba = model.predict_proba(X_val)[:, 1]
print(roc_auc_score(y_val, val_proba))

# NOTE(review): the submission keeps hard class labels, as in the original.
# If the task is scored on ROC-AUC, submit model.predict_proba(X_test)[:, 1]
# instead -- confirm against the problem statement.
pred = model.predict(X_test)
submit = pd.DataFrame({'PassengerId': X_test_id, 'Survived': pred})
submit.to_csv('0000.csv', index=False)

# Read the file back to confirm what was written.
print(pd.read_csv('0000.csv').head())

# Accuracy on the data the model was fitted on; compare it with the
# validation metric above rather than reading it as expected performance.
print(model.score(X_tr, y_tr))