# -*- coding: utf-8 -*-
"""
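Spaceship Titanic — LightGBM CV pipeline (seed=42, 5-fold, soft-vote)
==================================================
A script that reproduces the whole workflow end to end: feature engineering,
LightGBM training, 5-fold CV evaluation, and submission CSV creation.

Main specifications:
- Reproducibility: a single random seed (42) is used throughout
- 5-fold StratifiedKFold for stable evaluation
- Fast training with LightGBM (supports both CPU and GPU)
- Test predictions are the average of each fold's probabilities (soft voting)
- Output file: submission_lgbm_cv_seed42.csv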
"""

import os, random, numpy as np, pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# ==================================================
# 1. Fix the random seeds (for reproducibility)
# ==================================================
SEED = 42

# Seed NumPy's global RNG and the stdlib RNG with the same value.
np.random.seed(SEED)
random.seed(SEED)

# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts,
# so setting it here presumably only reaches child processes — confirm.
os.environ['PYTHONHASHSEED'] = f'{SEED}'

# ==================================================
# 2. Load the data
# ==================================================
TRAIN_CSV, TEST_CSV = 'train.csv', 'test.csv'

# Both paths are bare file names, so they resolve against the current
# working directory.
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# ==================================================
# 3. Basic configuration
# ==================================================
# Identifier and label columns.
ID_COL = 'PassengerId'
TARGET_COL = 'Transported'

# True/False columns that are converted to 1/0.
BOOL_COLS = ['CryoSleep', 'VIP']

# Per-amenity spending columns (summed into TotalSpend).
SPEND_COLS = [
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]

# Categorical columns that are one-hot encoded.
CAT_COLS = [
    'HomePlanet',
    'Destination',
    'Deck',
    'Side',
]

# ==================================================
# 4. Feature-engineering function
# ==================================================
def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """Clean the raw passenger table and derive the model features.

    The input frame is not modified; all work happens on a copy.

    Steps, in order:
      * map the BOOL_COLS columns from True/False to 1/0 (missing stays NaN),
      * zero every spending column for passengers with CryoSleep == 1,
      * fill a missing CryoSleep from spending: 1 when the row's spending
        total is 0, otherwise 0,
      * split Cabin on '/' into Deck (first part) and Side (third part),
      * split the ID column on '_' into Group and an integer GroupOrder,
      * fill missing Age / spending / VIP with 0 and missing categorical
        values with 'Unknown',
      * add TotalSpend, HasSpend, GroupSize and IsAlone.

    GroupSize counts the rows sharing a Group *within the frame passed in*,
    so concatenate train and test first if groups may span both.

    Args:
        df: Raw passenger rows. Must contain 'CryoSleep', 'VIP', 'Cabin',
            'Age', 'HomePlanet', 'Destination', the ID column and every
            spending column.

    Returns:
        A new DataFrame holding the cleaned original columns plus
        TotalSpend, Deck, Side, Group, GroupOrder, GroupSize, IsAlone and
        HasSpend.
    """
    out = df.copy()

    # --- Boolean columns (CryoSleep, VIP) -> 1/0; missing values stay NaN ---
    for col in BOOL_COLS:
        if col in out.columns:
            out[col] = out[col].map({True: 1, False: 0})

    # --- Spending total (NaN counted as 0), used below to infer CryoSleep ---
    out['TotalSpend'] = out[SPEND_COLS].sum(axis=1, skipna=True)

    # --- Tie spending and CryoSleep together ---
    # Passengers in cryosleep get all spending set to 0.
    in_cryo = out['CryoSleep'] == 1
    out.loc[in_cryo, SPEND_COLS] = 0
    # Unknown CryoSleep: no recorded spending -> 1, any spending -> 0.
    cryo_na = out['CryoSleep'].isna()
    no_spend = out['TotalSpend'].fillna(0) == 0
    out.loc[cryo_na & no_spend, 'CryoSleep'] = 1
    out.loc[cryo_na & ~no_spend, 'CryoSleep'] = 0

    # --- Cabin -> Deck / Side ---
    # reindex guarantees columns 0..2 exist even when no cabin value has all
    # three '/'-separated parts (e.g. every Cabin is missing); without it
    # cab[2] would raise KeyError.
    cab = (
        out['Cabin']
        .fillna('Unknown')
        .str.split('/', expand=True)
        .reindex(columns=range(3))
    )
    out['Deck'] = cab[0].fillna('Unknown')
    out['Side'] = cab[2].fillna('Unknown')

    # --- PassengerId -> Group / GroupOrder ---
    pid = out[ID_COL].str.split('_', expand=True)
    out['Group'] = pid[0]
    out['GroupOrder'] = pid[1].astype(int)

    # --- Fill missing numeric values with 0 ---
    for col in ['Age'] + SPEND_COLS + ['TotalSpend']:
        out[col] = pd.to_numeric(out[col], errors='coerce').fillna(0)
    out['VIP'] = out['VIP'].fillna(0)

    # --- Fill missing categorical values ---
    for col in CAT_COLS:
        out[col] = out[col].fillna('Unknown')

    # --- Recompute the total now that cryosleep spending has been zeroed ---
    out['TotalSpend'] = out[SPEND_COLS].sum(axis=1, skipna=True)

    # --- GroupSize / IsAlone, counted over the rows of this frame ---
    group_counts = out['Group'].value_counts()
    out['GroupSize'] = out['Group'].map(group_counts)
    out['IsAlone'] = (out['GroupSize'] == 1).astype(int)

    # --- HasSpend: 1 when the passenger spent anything ---
    out['HasSpend'] = (out['TotalSpend'] > 0).astype(int)

    return out

# ==================================================
# 5. Combine train/test -> compute GroupSize
# ==================================================
# Tag every row with its origin so the two sets can be separated again later.
test['_is_train'] = 0
train['_is_train'] = 1
df_all = build_features(pd.concat([train, test], ignore_index=True))

# Recompute GroupSize per Group over the combined rows, then derive IsAlone
# from the fresh GroupSize (assign evaluates the keyword arguments in order).
df_all = df_all.assign(
    GroupSize=lambda d: d.groupby('Group')[ID_COL].transform('count'),
    IsAlone=lambda d: d['GroupSize'].eq(1).astype(int),
)

# ==================================================
# 6. One-hot encoding (the Group column is excluded)
# ==================================================
df_all = pd.get_dummies(data=df_all, columns=CAT_COLS, drop_first=False)

# Remove columns that are no longer needed; names that are absent are skipped.
df_all = df_all.drop(columns=['Name', 'Cabin', 'Group'], errors='ignore')

# ==================================================
# 7. Split back into train / test
# ==================================================
train_df = df_all.loc[df_all['_is_train'].eq(1)].copy()
test_df = df_all.loc[df_all['_is_train'].eq(0)].copy()

# Binary label as 0/1 integers.
y = train_df[TARGET_COL].map({True: 1, False: 0}).astype(int)

# Drop the label, the origin flag and PassengerId, then keep only
# numeric/bool columns so the matrices are in a form LightGBM accepts.
drop_cols = [TARGET_COL, '_is_train', ID_COL]
train_df = train_df.drop(columns=drop_cols, errors='ignore')
test_df = test_df.drop(columns=drop_cols, errors='ignore')

X = train_df.select_dtypes(include=['number', 'bool']).astype('float32')
# Align the test columns to the training columns; a column missing from the
# test frame is created and filled with 0.
X_test = (
    test_df
    .reindex(columns=X.columns, fill_value=0)
    .select_dtypes(include=['number', 'bool'])
    .astype('float32')
)

# ==================================================
# 8. Cross-validated training with StratifiedKFold
# ==================================================
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
oof = np.zeros(len(X), dtype=float)             # out-of-fold probabilities
test_proba = np.zeros(len(X_test), dtype=float)  # fold-averaged test probabilities
scores = []                                      # per-fold validation accuracy

# --- LightGBM hyperparameters (identical for every fold) ---
lgbm_params = dict(
    objective='binary',
    learning_rate=0.05,
    n_estimators=1200,
    num_leaves=31,
    max_depth=-1,
    subsample=0.9,
    subsample_freq=1,
    colsample_bytree=0.9,
    min_child_samples=20,
    reg_lambda=0.0,
    reg_alpha=0.0,
    random_state=SEED,
    n_jobs=-1,
)

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), start=1):
    X_tr, y_tr = X.iloc[tr_idx], y.iloc[tr_idx]
    X_va, y_va = X.iloc[va_idx], y.iloc[va_idx]

    # --- Train a fresh model (verbose is deliberately not passed) ---
    # NOTE(review): no early-stopping option is given, so eval_set appears to
    # be evaluated only, not used to stop training — confirm this is intended.
    model = LGBMClassifier(**lgbm_params)
    model.fit(X_tr, y_tr, eval_set=[(X_va, y_va)], eval_metric='logloss')

    # --- Per-fold evaluation at the 0.5 threshold ---
    va_p = model.predict_proba(X_va)[:, 1]
    oof[va_idx] = va_p
    acc = accuracy_score(y_va, (va_p >= 0.5).astype(int))
    scores.append(acc)
    print(f'[Fold {fold}] Accuracy: {acc:.5f}')

    # --- Soft voting: add this fold's share of the averaged test probability ---
    test_proba += model.predict_proba(X_test)[:, 1] / skf.n_splits

# ==================================================
# 9. Report the CV score
# ==================================================
print(f'CV mean±std: {np.mean(scores):.5f} ± {np.std(scores):.5f}')

# ==================================================
# 10. Build the submission file
# ==================================================
# One constant for the output path so the "Saved:" message cannot drift from
# the file that is actually written. The name follows the output file stated
# in the module docstring.
SUBMISSION_CSV = 'submission_lgbm_cv_seed42.csv'

# Averaged fold probabilities -> hard 0/1 labels at the 0.5 threshold.
test_pred = (test_proba >= 0.5).astype(int)
submission = pd.DataFrame({
    'PassengerId': test[ID_COL],
    # A plain boolean array is attached positionally, so the labels do not
    # depend on `test` having a default 0..n-1 index.
    'Transported': test_pred.astype(bool),
})
submission.to_csv(SUBMISSION_CSV, index=False)
print(f'Saved: {SUBMISSION_CSV}')
