머신러닝(Machine Learning) 입문: 타이타닉 생존자 예측 문제로 배우는 ML 기초
김재우
(스프레드시트 기준 모델 예시 수식)
=IF(D2:D="female", 1, 0)
=IF(OR(D15="female", AND(B15=1, E15<=20, D15="male")), 1, 0)

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
# Load the raw Titanic dataset
data = pd.read_csv('titanic.csv')
data.head()
# Sample of the raw columns:
#    PassengerId  Survived  Pclass  Name                 Sex     Age  SibSp  Parch  Ticket     Fare     Cabin  Embarked
# 0  1            0         3       Braund, Mr. Owen...  male    22.0 1      0      A/5 21171  7.2500   NaN    S
# 1  2            1         1       Cumings, Mrs. Jo...  female  38.0 1      0      PC 17599   71.2833  C85    C
# PassengerId is just a row id; Name, Ticket, and Cabin are judged to have
# no predictive value for survival, so all four columns are removed.
data = data.drop(columns=['PassengerId', 'Name', 'Ticket', 'Cabin'])
# Check which columns contain null values
data.isnull().any()
# Survived    False
# Pclass      False
# Sex         False
# Age          True
# SibSp       False
# Parch       False
# Fare        False
# Embarked     True
# dtype: bool
# Handle nulls: fill Age with the column mean and Embarked with the mode.
# BUG FIX: the original filled 'Fare' (which has no nulls, see above) with the
# mode instead of 'Embarked', so Embarked NaNs survived and later produced
# all-zero rows in the one-hot encoding.
data.fillna({'Age': data['Age'].mean()}, inplace=True)
data.fillna({'Embarked': data['Embarked'].mode()[0]}, inplace=True)
# Encode sex numerically: male -> 0, female -> 1
sex_mapping = {'male': 0, 'female': 1}
data['Sex'] = data['Sex'].map(sex_mapping)
# One-hot encode the port of embarkation: Embarked -> Embarked_C, Embarked_Q, Embarked_S
one_hot = pd.get_dummies(data['Embarked'], prefix='Embarked')
data = pd.concat([data, one_hot], axis=1)
# Derive FamilySize = siblings/spouses + parents/children + the passenger themself
data["FamilySize"] = data["Parch"] + data["SibSp"] + 1
# Drop the columns that the engineered features now replace
data = data.drop(["Embarked", "Parch", "SibSp"], axis=1)
# Separate the target label from the feature matrix
y = data['Survived']
X = data.drop('Survived', axis=1)
# Hold out 20% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Fit a shallow decision tree (depth capped at 3 keeps it interpretable)
model = DecisionTreeClassifier(max_depth=3, random_state=42)
model.fit(X_train, y_train)
# Evaluate on the held-out test split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.2f}")
# Accuracy: 0.80
# BUG FIX: `from sklearn import tree` was fused into the accuracy comment
# above, so `tree` was never actually imported and export_graphviz raised
# a NameError.
from sklearn import tree
import matplotlib.pyplot as plt
import pydotplus
from IPython.display import Image, display
# Export the fitted tree to Graphviz dot format and render it inline
dot_data = tree.export_graphviz(model, out_file=None, filled=True, rounded=True,
                                feature_names=X.columns,
                                class_names=True)
graph = pydotplus.graph_from_dot_data(dot_data)
display(Image(graph.create_png()))
# Re-split the data with the same seed so the NN sees the same partition
# as the decision tree above.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
import tensorflow as tf
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
# Standardize the features, fitting the scaler on the training split only
# to avoid test-set leakage.
# BUG FIX: StandardScaler was imported but never applied, so the network
# trained on raw features with wildly different scales (Fare spans 0-512
# while the one-hot columns are 0/1), which hurts gradient-based training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Define a small fully-connected binary classifier
model = tf.keras.Sequential([
    tf.keras.layers.Dense(64, activation='relu', input_shape=(X_train.shape[1],)),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(1, activation='sigmoid')
])
# Adam optimizer with binary cross-entropy; track accuracy during training
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# Train for 50 epochs, holding out 20% of the training data for validation
model.fit(X_train, y_train, epochs=50, batch_size=32, validation_split=0.2)
# Threshold the sigmoid output at 0.5 to obtain hard 0/1 predictions
y_pred = (model.predict(X_test) > 0.5).astype("int32")
accuracy = accuracy_score(y_test, y_pred)
print(f"TensorFlow Model Accuracy: {accuracy:.2f}")
# Example output from the original (unscaled) run:
# 18/18 ━━━━━━━━━━━━━━━━━━━━ 0s 691us/step - accuracy: 0.8118 - loss: 0.4067 - val_accuracy: 0.8462 - val_loss: 0.3918
# 6/6 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step
# TensorFlow Model Accuracy: 0.83