import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Load the Titanic passenger data.
data = pd.read_csv("titanic.csv")
# Preview the first rows (only displayed when run interactively).
data.head()
# PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
# 0 1 0 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
# 1 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C
# 2 3 1 3 Heikkinen, Miss. Laina female 26.0 0 0 STON/O2. 3101282 7.9250 NaN S
# 3 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 0 113803 53.1000 C123 S
# 4 5 0 3 Allen, Mr. William Henry male 35.0 0 0 373450 8.0500 NaN S
# Drop PassengerId, Name, Ticket and Cabin: they were judged unlikely to
# influence survival.
data = data.drop(columns=["PassengerId", "Name", "Ticket", "Cabin"])
# Check which of the remaining columns contain missing values.
data.isna().any()
# Survived False
# Pclass False
# Sex False
# Age True
# SibSp False
# Parch False
# Fare False
# Embarked True
# dtype: bool
# Impute missing values: Age with the column mean, Embarked with its most
# frequent value (mode). These are the two columns the recorded null check
# reports as containing nulls. The previous code filled 'Fare' (which has no
# nulls) instead of 'Embarked', so missing Embarked values were never imputed
# and turned into all-zero rows in the later one-hot encoding.
data = data.fillna({
    'Age': data['Age'].mean(),
    'Embarked': data['Embarked'].mode()[0],
})
# Encode sex as an integer: male -> 0, female -> 1.
sex_mapping = dict(male=0, female=1)
data["Sex"] = data["Sex"].map(sex_mapping)
# One-hot encode the port of embarkation:
# Embarked -> Embarked_C, Embarked_Q, Embarked_S.
embarked_dummies = pd.get_dummies(data["Embarked"], prefix="Embarked")
data = pd.concat([data, embarked_dummies], axis="columns")
# Add FamilySize: siblings/spouses + parents/children + the passenger.
data["FamilySize"] = data["SibSp"] + data["Parch"] + 1
# Drop the source columns that the derived features replace.
data = data.drop(columns=["Embarked", "Parch", "SibSp"])
# Separate the target column from the feature matrix.
y = data["Survived"]
X = data.drop(columns="Survived")
# Hold out 20% of the rows for testing (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Train a shallow decision tree.
model = DecisionTreeClassifier(random_state=42, max_depth=3)
model.fit(X_train, y_train)
# Evaluate accuracy on the held-out test set.
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
# Recorded output:
# Accuracy: 0.80