ML_HYU_20240501
대학원생을 위한 차세대 연구 및 데이터 분석 - 한양대학교
커맨드스페이스
K-means 알고리즘의 결과를 시각화하기 위해 PCA(Principal Component Analysis)를 사용하여 2차원으로 차원 축소한 후, 산점도로 군집화 결과를 나타내는 코드
from sklearn.decomposition import PCA

# ... (previous code: X_scaled, kmeans and cluster_labels come from the
# K-means cell, where kmeans is fitted on X_scaled)

# Reduce the standardized features to two principal components so the
# clustering can be drawn as a 2-D scatter plot.
pca = PCA(n_components=2, random_state=42)
X_pca = pca.fit_transform(X_scaled)

# kmeans.cluster_centers_ is expressed in the standardized feature space the
# model was fitted on, not in PCA space. Taking its first two columns would
# plot two raw features on the PCA axes, so project the centers with the same
# fitted PCA before drawing them.
centers_pca = pca.transform(kmeans.cluster_centers_)

# Scatter plot of the samples coloured by cluster, with the projected centers.
plt.figure(figsize=(10, 8))
scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1], c=cluster_labels, cmap='viridis')
plt.scatter(centers_pca[:, 0], centers_pca[:, 1],
            s=200, marker='X', c='red', label='Cluster Centers')
plt.xlabel('PCA 1')
plt.ylabel('PCA 2')
plt.title('K-means Clustering Result')
plt.colorbar(scatter)
plt.legend()
plt.show()
Reaction
Comment
Share
커맨드스페이스
군집 분석과 클러스터링을 위한 K-means 알고리즘
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score

# Variables used for the cluster analysis.
cluster_columns = [
    'organizational_characteristics_avg',
    'management_emphasis_avg',
    'organizational_communication_avg',
    'organizational_culture_avg',
]
X = df[cluster_columns]

# Standardize the features before clustering.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Find the number of clusters with the highest silhouette score.
# n_init is set explicitly because its default differs between scikit-learn
# releases, which would otherwise make the result depend on the installed
# version.
candidate_n_clusters = range(2, 11)
silhouette_scores = []
for n_clusters in candidate_n_clusters:
    kmeans = KMeans(n_clusters=n_clusters, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(X_scaled)
    silhouette_scores.append(silhouette_score(X_scaled, cluster_labels))

# Look the winner up in the candidate range instead of adding a hard-coded
# offset that must be kept in sync with the range start.
best_position = silhouette_scores.index(max(silhouette_scores))
optimal_n_clusters = candidate_n_clusters[best_position]
print(f"최적의 군집 수: {optimal_n_clusters}")

# Fit the final K-means model with the selected number of clusters.
kmeans = KMeans(n_clusters=optimal_n_clusters, n_init=10, random_state=42)
cluster_labels = kmeans.fit_predict(X_scaled)

# Attach the cluster label to every row and print the result.
df['Cluster'] = cluster_labels
print("\n군집 분석 결과:")
print(df[['Cluster', *cluster_columns]])
Reaction
Comment
Share
커맨드스페이스
로지스틱 회귀 계수
import numpy as np
from sklearn.preprocessing import StandardScaler

# NOTE(review): assumes lr_model is the fitted LogisticRegression and X / y are
# the predictors / binary target from the logistic-regression cell — confirm
# before running this cell on its own.

# Raw logistic-regression coefficients (log-odds per unit of each predictor).
coefficients = lr_model.coef_[0]
coef_df = pd.DataFrame({'Variable': X.columns, 'Coefficient': coefficients})
print("로지스틱 회귀 계수:")
print(coef_df)

# Odds ratio = exp(coefficient).
odds_ratio = np.exp(coefficients)
coef_df['Odds Ratio'] = odds_ratio
print("\nOdds Ratio:")
print(coef_df)

# Marginal effect at the mean. For a logistic model
#     dP(y=1 | x) / dx_j = beta_j * p * (1 - p),
# evaluated here at the mean of every predictor. The earlier version
# multiplied by the predictor means instead of by p, which is not a derivative
# of the predicted probability.
# X_mean stays a one-row DataFrame so the model sees the same feature names it
# was fitted with.
X_mean = X.mean().to_frame().T
p_mean = lr_model.predict_proba(X_mean)[0, 1]
partial_derivative = coefficients * p_mean * (1 - p_mean)
coef_df['Partial Derivative'] = partial_derivative
print("\n편미분:")
print(coef_df)

# Standardize the predictors and refit, so the coefficients are comparable
# across variables measured on different scales.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

lr_model_std = LogisticRegression(random_state=42)
lr_model_std.fit(X_scaled, y)

coef_df['Standardized Coefficient'] = lr_model_std.coef_[0]
print("\n표준화 회귀 계수:")
print(coef_df)
Reaction
Comment
Share
커맨드스페이스
ROC 커브, 컨퓨전 매트릭스
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from matplotlib import pyplot as plt

# ... (previous code)

# Class predictions and the predicted probability of the positive class
# (column 1 of predict_proba) for X_test.
y_pred = lr_model.predict(X_test)
y_pred_proba = lr_model.predict_proba(X_test)[:, 1]

# Confusion matrix and classification report.
print("\n교육 훈련 효과 분석 (로지스틱 회귀):")
print("혼동 행렬:")
print(confusion_matrix(y_test, y_pred))
print("\n분류 보고서:")
print(classification_report(y_test, y_pred))

# ROC curve and the area under it.
fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
roc_auc = auc(fpr, tpr)

roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, color='darkorange', lw=2,
            label=f'ROC curve (AUC = {roc_auc:.2f})')
# Diagonal reference line from (0, 0) to (1, 1).
roc_ax.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
roc_ax.set_xlim([0.0, 1.0])
roc_ax.set_ylim([0.0, 1.05])
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC)')
roc_ax.legend(loc="lower right")
plt.show()
Reaction
Comment
Share
커맨드스페이스
머신러닝 평가
Reaction
Comment
Share
커맨드스페이스
로지스틱 회귀
교육 훈련 관련 변수들을 사용하여 직무 만족도가 높은 그룹(1)과 낮은 그룹(0)을 분류하는 코드
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Predictors: the three training-related composite scores.
training_columns = ['training_avg', 'informal_learning_avg', 'self_directed_learning_avg']
X = df[training_columns]

# Binary target: 1 when the job-satisfaction score is at least 3.5, else 0.
y = (df['job_satisfaction_avg'] >= 3.5).astype(int)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit the logistic-regression classifier on the training split.
lr_model = LogisticRegression(random_state=42)
lr_model.fit(X_train, y_train)

# Evaluate on the test split.
y_pred = lr_model.predict(X_test)
accuracy, precision, recall, f1 = (
    score(y_test, y_pred)
    for score in (accuracy_score, precision_score, recall_score, f1_score)
)

print("교육 훈련 효과 분석 (로지스틱 회귀):")
print(f"정확도: {accuracy:.3f}")
print(f"정밀도: {precision:.3f}")
print(f"재현율: {recall:.3f}")
print(f"F1 점수: {f1:.3f}")
Reaction
Comment
Share
커맨드스페이스
Permutation Importance
from sklearn.inspection import permutation_importance

# Fit a random forest on the full X / y currently in scope.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X, y)

# Permutation importance: 10 shuffles per feature, scored on the same X / y.
importance = permutation_importance(
    rf_model, X, y, n_repeats=10, random_state=42
)

# Horizontal box plot, features ordered by mean importance (ascending).
sorted_idx = importance.importances_mean.argsort()
sorted_importances = importance.importances[sorted_idx].T
sorted_names = X.columns[sorted_idx]

fig, ax = plt.subplots(figsize=(10, 8))
ax.boxplot(sorted_importances, vert=False, labels=sorted_names)
ax.set_title("Permutation Importance (Random Forest)")
fig.tight_layout()
plt.show()
Reaction
Comment
Share
커맨드스페이스
지금까지 정리된 코드
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.inspection import permutation_importance, PartialDependenceDisplay

# --- Predictor and target selection ---
x_columns = [
    'hr_role_avg',
    'hr_trust_avg',
    'training_avg',
    'informal_learning_avg',
    'self_directed_learning_avg',
    'organizational_characteristics_avg',
    'management_emphasis_avg',
    'organizational_communication_avg',
    'organizational_culture_avg',
]
y_column = 'job_satisfaction_avg'

# --- Train / test split (80 / 20) ---
X = df[x_columns]
y = df[y_column]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# --- Model comparison: fit each regressor and score it on the test split ---
models = [
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(n_estimators=100, random_state=42)),
]

results = []
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse, mae, r2 = (
        metric(y_test, y_pred)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )
    results.append([name, mse, mae, r2])

result_df = pd.DataFrame(results, columns=['Model', 'MSE', 'MAE', 'R-squared'])
print("Model Comparison:")
print(result_df)

# --- Correlation matrix of all predictors plus the target ---
corr_matrix = df[[*x_columns, y_column]].corr()
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5)
plt.title('Correlation Matrix')
plt.show()

# --- Linear-regression coefficients (fitted on the full data) ---
lr_model = LinearRegression()
lr_model.fit(X, y)
coef_df = pd.DataFrame({'Variable': X.columns, 'Coefficient': lr_model.coef_})
print("\nRegression Coefficients:")
print(coef_df)

# --- Random-forest feature importance (fitted on the full data) ---
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X, y)
importance_df = pd.DataFrame(
    {'Variable': X.columns, 'Importance': rf_model.feature_importances_}
).sort_values('Importance', ascending=False)
print("\nFeature Importance:")
print(importance_df)

# --- Partial dependence plots from a gradient-boosting model ---
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_model.fit(X, y)
display = PartialDependenceDisplay.from_estimator(gb_model, X, X.columns)
pdp_figure = display.figure_
pdp_figure.set_size_inches(10, 8)
pdp_figure.suptitle('Partial Dependence Plot')
pdp_figure.subplots_adjust(hspace=0.5)
plt.show()
Reaction
Comment
Share
커맨드스페이스
부분 의존도 플롯 - 그래디언트 부스팅 버전
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.inspection import PartialDependenceDisplay

# Features to plot; the same columns are used as model inputs.
features = [
    'hr_role_avg',
    'hr_trust_avg',
    'training_avg',
    'informal_learning_avg',
    'self_directed_learning_avg',
    'organizational_characteristics_avg',
    'management_emphasis_avg',
    'organizational_communication_avg',
    'organizational_culture_avg',
]

# Predictors and the job-satisfaction target.
X = df[features]
y = df['job_satisfaction_avg']

# Fit the gradient-boosting regressor on the full data.
gb_model = GradientBoostingRegressor(n_estimators=100, random_state=42)
gb_model.fit(X, y)

# One partial dependence panel per feature.
display = PartialDependenceDisplay.from_estimator(gb_model, X, features)
pdp_figure = display.figure_
pdp_figure.set_size_inches(10, 8)  # enlarge the figure
pdp_figure.suptitle('Partial Dependence Plot')
pdp_figure.subplots_adjust(hspace=0.5)
plt.show()
Reaction
Comment
Share
커맨드스페이스
부분 의존도 플롯 (Partial Dependence Plot)
from sklearn.inspection import PartialDependenceDisplay

# Fit a random forest on the X / y currently in scope.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X, y)

# Draw the partial dependence panels for every column of X inside the given
# axes.
fig, ax = plt.subplots(figsize=(10, 8))
display = PartialDependenceDisplay.from_estimator(
    rf_model,
    X,
    features=X.columns,
    ax=ax,
)
fig.suptitle('Partial Dependence Plot (Random Forest)')
fig.subplots_adjust(hspace=0.5)
plt.show()
Reaction
Comment
Share
커맨드스페이스
특징 중요도 (Feature Importance)
from sklearn.ensemble import RandomForestRegressor

# Predictors and the job-satisfaction target.
predictor_columns = [
    'hr_role_avg',
    'hr_trust_avg',
    'training_avg',
    'informal_learning_avg',
    'self_directed_learning_avg',
    'organizational_characteristics_avg',
    'management_emphasis_avg',
    'organizational_communication_avg',
    'organizational_culture_avg',
]
X = df[predictor_columns]
y = df['job_satisfaction_avg']

# Fit the random forest on the full data.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X, y)

# Table of feature importances, largest first.
importance_df = pd.DataFrame(
    {'Variable': X.columns, 'Importance': rf_model.feature_importances_}
).sort_values('Importance', ascending=False)
print(importance_df)
Reaction
Comment
Share
커맨드스페이스
회귀 계수
from sklearn.linear_model import LinearRegression

# Predictors and the job-satisfaction target.
predictor_columns = [
    'hr_role_avg',
    'hr_trust_avg',
    'training_avg',
    'informal_learning_avg',
    'self_directed_learning_avg',
    'organizational_characteristics_avg',
    'management_emphasis_avg',
    'organizational_communication_avg',
    'organizational_culture_avg',
]
X = df[predictor_columns]
y = df['job_satisfaction_avg']

# Ordinary least-squares fit on the full data.
lr_model = LinearRegression()
lr_model.fit(X, y)

# One row per predictor with its fitted coefficient.
coef_df = pd.DataFrame(
    {
        'Variable': X.columns,
        'Coefficient': lr_model.coef_,
    }
)
print(coef_df)
Reaction
Comment
Share
커맨드스페이스
상관 분석 (Correlation Analysis)
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Columns included in the correlation matrix: the nine composite predictors
# followed by the job-satisfaction score.
correlation_columns = [
    'hr_role_avg',
    'hr_trust_avg',
    'training_avg',
    'informal_learning_avg',
    'self_directed_learning_avg',
    'organizational_characteristics_avg',
    'management_emphasis_avg',
    'organizational_communication_avg',
    'organizational_culture_avg',
    'job_satisfaction_avg',
]
corr_matrix = df[correlation_columns].corr()

# Annotated heatmap of the correlation coefficients.
heatmap_fig, heatmap_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=0.5,
    ax=heatmap_ax,
)
heatmap_ax.set_title('Correlation Matrix')
plt.show()
Reaction
Comment
Share
커맨드스페이스
모델 비교
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Predictors, target and an 80 / 20 train / test split.
X = df[['hr_role_avg', 'hr_trust_avg', 'training_avg', 'informal_learning_avg',
        'self_directed_learning_avg', 'organizational_characteristics_avg',
        'management_emphasis_avg', 'organizational_communication_avg',
        'organizational_culture_avg']]
y = df['job_satisfaction_avg']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit each model on the training split and score it on the test split.
models = [
    ('Linear Regression', LinearRegression()),
    ('Random Forest', RandomForestRegressor(n_estimators=100, random_state=42)),
    ('Gradient Boosting', GradientBoostingRegressor(n_estimators=100, random_state=42)),
]

results = []
for name, model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    results.append([name, mse, mae, r2])

# Results table.
result_df = pd.DataFrame(results, columns=['Model', 'MSE', 'MAE', 'R-squared'])
print(result_df)

# Grouped bar chart: one group per model, one bar per metric.
# Every series is placed on explicit numeric positions. The earlier version
# drew the first series on the model-name strings and the others on numeric
# offsets, which only lined up because matplotlib maps categories to 0, 1, 2.
fig, ax = plt.subplots(figsize=(10, 6))
bar_width = 0.2
group_positions = list(range(len(result_df)))
for series_index, metric_name in enumerate(['MSE', 'MAE', 'R-squared']):
    ax.bar(
        [position + series_index * bar_width for position in group_positions],
        result_df[metric_name],
        width=bar_width,
        align='center',
        label=metric_name,
        alpha=0.7,
    )

# Centre each tick under the middle bar of its group and label it by model.
ax.set_xticks([position + bar_width for position in group_positions])
ax.set_xticklabels(result_df['Model'])
ax.set_ylabel('Score')
ax.set_title('Model Comparison')
ax.legend()
plt.tight_layout()
plt.show()
Reaction
Comment
Share
커맨드스페이스
그래디언트 부스팅 모델 평가, 시각화
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error

# ... (gradient-boosting training and prediction code)

# Error metrics for the gradient-boosting predictions.
mse_gb = mean_squared_error(y_test, y_pred_gb)
mae_gb = mean_absolute_error(y_test, y_pred_gb)
r2_gb = r2_score(y_test, y_pred_gb)

print("Gradient Boosting:")
print("Mean Squared Error:", mse_gb)
print("Mean Absolute Error:", mae_gb)
print("R-squared:", r2_gb)

# Actual vs. predicted scatter; the dashed diagonal marks perfect predictions.
gb_fig, gb_ax = plt.subplots(figsize=(8, 6))
gb_ax.scatter(y_test, y_pred_gb)
gb_diagonal = [y_test.min(), y_test.max()]
gb_ax.plot(gb_diagonal, gb_diagonal, 'k--', lw=2)
gb_ax.set_xlabel('Actual Values')
gb_ax.set_ylabel('Predicted Values')
gb_ax.set_title('Gradient Boosting: Actual vs. Predicted')
plt.show()
Reaction
Comment
Share
커맨드스페이스
랜덤 포레스트 모델 평가, 시각화
from sklearn.metrics import mean_absolute_error

# ... (random-forest training and prediction code)

# Error metrics for the random-forest predictions.
mse_rf = mean_squared_error(y_test, y_pred_rf)
mae_rf = mean_absolute_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest:")
print("Mean Squared Error:", mse_rf)
print("Mean Absolute Error:", mae_rf)
print("R-squared:", r2_rf)

# Actual vs. predicted scatter; the dashed diagonal marks perfect predictions.
rf_fig, rf_ax = plt.subplots(figsize=(8, 6))
rf_ax.scatter(y_test, y_pred_rf)
rf_diagonal = [y_test.min(), y_test.max()]
rf_ax.plot(rf_diagonal, rf_diagonal, 'k--', lw=2)
rf_ax.set_xlabel('Actual Values')
rf_ax.set_ylabel('Predicted Values')
rf_ax.set_title('Random Forest: Actual vs. Predicted')
plt.show()
Reaction
Comment
Share
커맨드스페이스
선형 회귀 모델 평가, 시각화
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error

# ... (linear-regression training and prediction code)

# Error metrics for the linear-regression predictions.
mse_lr = mean_squared_error(y_test, y_pred_lr)
mae_lr = mean_absolute_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)

print("Linear Regression:")
print("Mean Squared Error:", mse_lr)
print("Mean Absolute Error:", mae_lr)
print("R-squared:", r2_lr)

# Actual vs. predicted scatter; the dashed diagonal marks perfect predictions.
lr_fig, lr_ax = plt.subplots(figsize=(8, 6))
lr_ax.scatter(y_test, y_pred_lr)
lr_diagonal = [y_test.min(), y_test.max()]
lr_ax.plot(lr_diagonal, lr_diagonal, 'k--', lw=2)
lr_ax.set_xlabel('Actual Values')
lr_ax.set_ylabel('Predicted Values')
lr_ax.set_title('Linear Regression: Actual vs. Predicted')
plt.show()
Reaction
Comment
Share
커맨드스페이스
그래디언트 부스팅
# Train the gradient-boosting regressor on the training split.
gb = GradientBoostingRegressor(
    n_estimators=100,
    random_state=42,
)
gb.fit(X_train, y_train)

# Predict the test split.
y_pred_gb = gb.predict(X_test)

# Score the predictions.
mse_gb, r2_gb = (
    metric(y_test, y_pred_gb) for metric in (mean_squared_error, r2_score)
)

print("Gradient Boosting:")
print("Mean Squared Error:", mse_gb)
print("R-squared:", r2_gb)
Reaction
Comment
Share
커맨드스페이스
랜덤 포레스트
# Train the random-forest regressor on the training split.
rf = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
rf.fit(X_train, y_train)

# Predict the test split.
y_pred_rf = rf.predict(X_test)

# Score the predictions.
mse_rf, r2_rf = (
    metric(y_test, y_pred_rf) for metric in (mean_squared_error, r2_score)
)

print("Random Forest:")
print("Mean Squared Error:", mse_rf)
print("R-squared:", r2_rf)
Reaction
Comment
Share
커맨드스페이스
선형 회귀
# Train the linear-regression model on the training split.
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict the test split.
y_pred_lr = lr.predict(X_test)

# Score the predictions.
mse_lr, r2_lr = (
    metric(y_test, y_pred_lr) for metric in (mean_squared_error, r2_score)
)

print("Linear Regression:")
print("Mean Squared Error:", mse_lr)
print("R-squared:", r2_lr)
Reaction
Comment
Share
커맨드스페이스
데이터 전처리 부분
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score

# Separate the feature columns from the target column.
feature_columns = [
    'hr_role_avg',
    'hr_trust_avg',
    'training_avg',
    'informal_learning_avg',
    'self_directed_learning_avg',
    'organizational_characteristics_avg',
    'management_emphasis_avg',
    'organizational_communication_avg',
    'organizational_culture_avg',
]
X = df[feature_columns]
y = df['job_satisfaction_avg']

# Hold out 20% of the rows for testing; the seed is fixed for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
Reaction
Comment
Share
커맨드스페이스
구글 드라이브에 저장
# Save the processed DataFrame as a new Excel file in the Data folder of the
# mounted Google Drive (the row index is not written).
processed_path = '/content/drive/My Drive/Data/ChatGPT_Sample_Data_Encoded_0518_processed.xlsx'
df.to_excel(processed_path, index=False)
Reaction
Comment
Share
커맨드스페이스
변수 정리
import pandas as pd

# Read the Excel file (left disabled, as in the original cell).
# df = pd.read_excel('ChatGPT_Sample_Data_Encoded_0518.xlsx')

# Survey item code -> readable column name.
column_names = {
    'ID': 'worker_id',
    'WC1Q02_02': 'company_worker_id',
    'WC1Q03_01': 'industry_classification',
    'WC1Q04': 'year_of_joining',
    'WC1Q05_01': 'current_position',
    'WC1Q06_01': 'contribution_to_strategy',
    'WC1Q06_02': 'influence_on_decision_making',
    'WC1Q06_03': 'role_in_hr_improvement',
    'WC1Q06_04': 'leading_change_and_innovation',
    'WC1Q06_05': 'educating_managers',
    'WC1Q06_06': 'explaining_hr_policies',
    'WC1Q06_07': 'trusted_by_employees',
    'WC1Q06_08': 'expertise_in_field',
    'WC1Q14_01': 'sufficient_training',
    'WC1Q14_02': 'equal_training_opportunities',
    'WC1Q14_03': 'desire_for_training',
    'WC1Q14_04': 'job_relevance_of_training',
    'WC1Q14_05': 'applicability_of_training',
    'WC1Q14_06': 'versatility_of_training',
    'WC1Q15_01': 'peer_coaching_mentoring',
    'WC1Q15_02': 'supervisor_coaching_mentoring',
    'WC1Q15_03': 'knowledge_sharing_among_employees',
    'WC1Q15_04': 'knowledge_sharing_through_platform',
    'WC1Q15_05': 'learning_through_job_rotation',
    'WC1Q16_01': 'support_for_academy_tuition',
    'WC1Q16_02': 'support_for_domestic_university_tuition',
    'WC1Q16_03': 'support_for_domestic_graduate_school_tuition',
    'WC1Q16_04': 'support_for_overseas_graduate_degree',
    'WC1Q22_01': 'active_problem_solving',
    'WC1Q22_02': 'diverse_experience_opportunities',
    'WC1Q22_03': 'autonomy_in_work',
    'WC1Q22_04': 'cooperation_and_trust',
    'WC1Q22_05': 'diverse_competencies_and_experience',
    'WC1Q22_06': 'good_communication',
    'WC1Q23_01': 'preferential_treatment_of_talent',
    'WC1Q23_02': 'clear_vision_for_hr_development',
    'WC1Q23_03': 'emphasis_on_talent',
    'WC1Q23_04': 'diverse_training_methods',
    'WC1Q24_01': 'informing_employees',
    'WC1Q24_02': 'freedom_to_express_opinions',
    'WC1Q24_03': 'interdepartmental_communication',
    'WC1Q24_04': 'trust_among_colleagues',
    'WC1Q24_05': 'fair_evaluation_and_compensation',
    'WC1Q24_06': 'trustworthy_management',
    'WC1Q25_01': 'encouraging_change_and_innovation',
    'WC1Q25_02': 'rewarding_innovation',
    'WC1Q25_03': 'favoring_creativity_over_diligence',
    'WC1Q25_04': 'family_like_atmosphere',
    'WC1Q25_05': 'emphasis_on_unity',
    'WC1Q25_06': 'emphasis_on_teamwork',
    'WC1Q25_07': 'emphasis_on_formal_procedures',
    'WC1Q25_08': 'top_down_communication',
    'WC1Q25_09': 'emphasis_on_hierarchy',
    'WC1Q25_10': 'emphasis_on_competition_and_performance',
    'WC1Q25_11': 'emphasis_on_expertise_and_ability',
    'WC1Q25_12': 'performance_based_evaluation',
    'WC1Q26_01': 'satisfaction_with_job_content',
    'WC1Q26_02': 'satisfaction_with_salary',
    'WC1Q26_03': 'satisfaction_with_relationships',
    'WC1Q26_04': 'overall_job_satisfaction',
    'WC1Q27_01': 'considering_leaving_for_better_conditions',
    'WC1Q27_02': 'identifying_with_company_problems',
    'WC1Q27_03': 'losing_much_if_leaving_company',
    'WC1Q27_04': 'company_deserving_loyalty',
    'WC1Q27_05': 'unfair_development_and_promotion_opportunities',
    'WC1Q27_06': 'feeling_helpless_or_tired',
    'WC1Q27_07': 'increased_job_tension',
    'WC1DQ01': 'gender',
    'WC1DQ02_01': 'birth_year',
    'WC1DQ02_02': 'birth_month',
    'WC1DQ03': 'marital_status',
    'WC1DQ04': 'education_level',
    'WC1DQ04_01': 'major',
}
df = df.rename(columns=column_names)

# Composite score column -> the item columns whose row-wise mean it holds.
# NOTE(review): every composite is a plain mean of its items. Some item names
# (e.g. 'considering_leaving_for_better_conditions', 'feeling_helpless_or_tired')
# look negatively worded — confirm whether they need reverse coding first.
composite_scores = {
    'hr_role_avg': [
        'contribution_to_strategy',
        'influence_on_decision_making',
        'role_in_hr_improvement',
        'leading_change_and_innovation',
    ],
    'hr_trust_avg': [
        'educating_managers',
        'explaining_hr_policies',
        'trusted_by_employees',
        'expertise_in_field',
    ],
    'training_avg': [
        'sufficient_training',
        'equal_training_opportunities',
        'desire_for_training',
        'job_relevance_of_training',
        'applicability_of_training',
        'versatility_of_training',
    ],
    'informal_learning_avg': [
        'peer_coaching_mentoring',
        'supervisor_coaching_mentoring',
        'knowledge_sharing_among_employees',
        'knowledge_sharing_through_platform',
        'learning_through_job_rotation',
    ],
    'self_directed_learning_avg': [
        'support_for_academy_tuition',
        'support_for_domestic_university_tuition',
        'support_for_domestic_graduate_school_tuition',
        'support_for_overseas_graduate_degree',
    ],
    'organizational_characteristics_avg': [
        'active_problem_solving',
        'diverse_experience_opportunities',
        'autonomy_in_work',
        'cooperation_and_trust',
        'diverse_competencies_and_experience',
        'good_communication',
    ],
    'management_emphasis_avg': [
        'preferential_treatment_of_talent',
        'clear_vision_for_hr_development',
        'emphasis_on_talent',
        'diverse_training_methods',
    ],
    'organizational_communication_avg': [
        'informing_employees',
        'freedom_to_express_opinions',
        'interdepartmental_communication',
        'trust_among_colleagues',
        'fair_evaluation_and_compensation',
        'trustworthy_management',
    ],
    'organizational_culture_avg': [
        'encouraging_change_and_innovation',
        'rewarding_innovation',
        'favoring_creativity_over_diligence',
        'family_like_atmosphere',
        'emphasis_on_unity',
        'emphasis_on_teamwork',
        'emphasis_on_formal_procedures',
        'top_down_communication',
        'emphasis_on_hierarchy',
        'emphasis_on_competition_and_performance',
        'emphasis_on_expertise_and_ability',
        'performance_based_evaluation',
    ],
    'job_satisfaction_avg': [
        'satisfaction_with_job_content',
        'satisfaction_with_salary',
        'satisfaction_with_relationships',
        'overall_job_satisfaction',
    ],
    'organizational_commitment_avg': [
        'considering_leaving_for_better_conditions',
        'identifying_with_company_problems',
        'losing_much_if_leaving_company',
        'company_deserving_loyalty',
        'unfair_development_and_promotion_opportunities',
        'feeling_helpless_or_tired',
        'increased_job_tension',
    ],
}

# Add the composite columns in the order listed above.
for avg_column, item_columns in composite_scores.items():
    df[avg_column] = df[item_columns].mean(axis=1)

# Save the extended DataFrame (the row index is not written).
df.to_excel('ChatGPT_Sample_Data_Encoded_0518_processed.xlsx', index=False)
Reaction
Comment
Share
커맨드스페이스
데이터 준비
from google.colab import drive
import pandas as pd

# Mount Google Drive inside the Colab runtime.
drive.mount('/content/drive')

# Load the survey workbook from the Data folder of the mounted drive.
source_path = '/content/drive/My Drive/Data/ChatGPT_Sample_Data_Encoded_0518.xlsx'
df = pd.read_excel(source_path)
Reaction
Comment
Share
Share
커맨드스페이스
랜덤 포레스트
# Fit a 100-tree random forest on the training split (fixed seed).
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Predictions for the test split.
y_pred_rf = rf.predict(X_test)

# Mean squared error and R-squared of those predictions.
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)

for metric_label, metric_value in (
    ("Random Forest:", None),
    ("Mean Squared Error:", mse_rf),
    ("R-squared:", r2_rf),
):
    if metric_value is None:
        print(metric_label)
    else:
        print(metric_label, metric_value)
👍