# -*- coding: utf-8 -*-
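"""
Spaceship Titanic: CatBoost CPU reference version (reproducibility first, benchmark CPU settings)
============================================================
This script trains a CatBoost model for the Kaggle Spaceship Titanic competition.
- Guarantees full reproducibility (fixed seeds, fixed thread counts, rows sorted by PassengerId)
- Stable CPU-oriented parameters (rsm left unset, l2=3.0, subsample=0.9)
- 5-fold StratifiedKFold cross-validation with soft-vote averaging
Public LB: approx. 0.80640 (as of 2025)
"""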

# ------------------------------------------------------------
# ① Runtime setup: pin native thread pools for reproducibility
# ------------------------------------------------------------
import os

# Exported before the numeric libraries are imported further down, so they
# are already present in the environment when those libraries load.
os.environ.update(
    {
        "OMP_NUM_THREADS": "1",         # OpenMP
        "MKL_NUM_THREADS": "1",         # Intel MKL
        "OPENBLAS_NUM_THREADS": "1",    # OpenBLAS
        "VECLIB_MAXIMUM_THREADS": "1",  # macOS vecLib
        "NUMEXPR_NUM_THREADS": "1",     # numexpr
        # NOTE(review): CPython reads PYTHONHASHSEED at interpreter start-up,
        # so setting it here does not change hashing in this process; it is
        # only inherited by child processes. Export it in the shell before
        # launching the script if in-process hash stability is required.
        "PYTHONHASHSEED": "42",
    }
)

# ------------------------------------------------------------
# ② Library imports
# ------------------------------------------------------------
import random
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# Seed the global NumPy and Python random generators with one shared value;
# the same SEED is reused below for the CV splitter and for CatBoost.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)

# ------------------------------------------------------------
# ③ Load the data and order rows by PassengerId
# ------------------------------------------------------------
TRAIN_CSV = "train.csv"
TEST_CSV = "test.csv"

# Both files get the same treatment: read, sort by PassengerId so the row
# order does not depend on the file's own ordering, then renumber from 0.
train, test = (
    pd.read_csv(csv_path).sort_values("PassengerId").reset_index(drop=True)
    for csv_path in (TRAIN_CSV, TEST_CSV)
)

# Column groups referenced by the feature engineering and modelling code
SPEND_COLS = [
    "RoomService",
    "FoodCourt",
    "ShoppingMall",
    "Spa",
    "VRDeck",
]
BOOL_COLS = ["CryoSleep", "VIP"]
CAT_COLS = [
    "HomePlanet",
    "Destination",
    "Deck",
    "Side",
]
ID_COL = "PassengerId"
TARGET_COL = "Transported"

# ------------------------------------------------------------
# ④ Feature engineering function
# ------------------------------------------------------------
def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with cleaned columns and engineered features.

    The input frame is not modified. Steps, in order:

    * boolean columns (``BOOL_COLS``) are mapped to 1/0;
    * spending columns are zeroed for passengers in cryo-sleep;
    * missing ``CryoSleep`` is inferred from total spending
      (no spending -> 1, any spending -> 0);
    * ``Cabin`` is split on "/" into ``Deck`` (first part) and ``Side``
      (third part), with "Unknown" for anything missing;
    * the ID column is split on "_" into ``Group`` and integer ``GroupOrder``;
    * missing ``Age`` / spending values become 0, missing ``VIP`` becomes 0,
      missing categorical values become the string "Unknown";
    * ``TotalSpend``, ``HasSpend`` and ``SpendPerAge`` are derived;
    * ``GroupSize`` / ``IsAlone`` are added as placeholders (1 and 0).

    Args:
        df: Frame containing the ID column, ``Cabin``, ``Age``, ``CryoSleep``,
            ``VIP``, the spending columns and the raw categorical columns.

    Returns:
        A new DataFrame with the original columns (cleaned) plus the
        engineered ones.

    Raises:
        KeyError: If a required column is absent from *df*.
        ValueError: If the second part of an ID value is not an integer.
    """
    out = df.copy()

    # Booleans -> 1/0; values that are neither True nor False become NaN.
    for col in BOOL_COLS:
        if col in out.columns:
            out[col] = out[col].map({True: 1, False: 0})

    # Row-wise spending total. NaN is skipped, so this is never NaN itself.
    out["TotalSpend"] = out[SPEND_COLS].sum(axis=1, skipna=True)

    # Passengers known to be in cryo-sleep get all spending set to zero.
    out.loc[out["CryoSleep"] == 1, SPEND_COLS] = 0

    # Impute missing CryoSleep from the spending total computed above.
    cryo_missing = out["CryoSleep"].isna()
    no_spend = out["TotalSpend"] == 0
    out.loc[cryo_missing & no_spend, "CryoSleep"] = 1
    out.loc[cryo_missing & ~no_spend, "CryoSleep"] = 0

    # Split Cabin into its "/"-separated parts. Reindexing to three columns
    # keeps the lookups below valid even when no row has all three parts
    # (e.g. every Cabin value is missing).
    cabin_parts = (
        out["Cabin"]
        .fillna("Unknown")
        .str.split("/", expand=True)
        .reindex(columns=[0, 1, 2])
    )
    out["Deck"] = cabin_parts[0].fillna("Unknown")
    out["Side"] = cabin_parts[2].fillna("Unknown")

    # ID "<group>_<order>" -> Group (string) and GroupOrder (int).
    id_parts = out[ID_COL].str.split("_", expand=True)
    out["Group"] = id_parts[0]
    out["GroupOrder"] = id_parts[1].astype(int)

    # Numeric columns: unparseable or missing values become 0.
    for col in ["Age", *SPEND_COLS]:
        out[col] = pd.to_numeric(out[col], errors="coerce").fillna(0)

    # Missing VIP is treated as not-VIP.
    out["VIP"] = out["VIP"].fillna(0)

    # Categorical columns become plain Python strings with an explicit
    # "Unknown" level instead of NaN.
    for col in CAT_COLS:
        out[col] = out[col].astype("string").fillna("Unknown").astype(str)

    # Recompute the total now that the spending columns are final.
    out["TotalSpend"] = out[SPEND_COLS].sum(axis=1, skipna=True)

    # Placeholders only; this function does not compute group statistics.
    out["GroupSize"] = 1
    out["IsAlone"] = 0

    # Spending indicator and spending relative to age (+1 avoids division by 0).
    out["HasSpend"] = (out["TotalSpend"] > 0).astype(int)
    out["SpendPerAge"] = out["TotalSpend"] / (out["Age"] + 1.0)

    return out

# ------------------------------------------------------------
# ⑤ Combine train/test, engineer features, recompute GroupSize/IsAlone
# ------------------------------------------------------------
# Tag each row with its origin so the two sets can be separated again later.
train["_is_train"] = 1
test["_is_train"] = 0
df_all = build_features(pd.concat([train, test], ignore_index=True))

# Group-level features, counted over train and test together; these replace
# the placeholder values written by build_features.
group_sizes = df_all.groupby("Group")[ID_COL].transform("count")
df_all["GroupSize"] = group_sizes
df_all["IsAlone"] = (group_sizes == 1).astype(int)

# Remove columns that are not used as model features.
droppable = [col for col in ("Name", "Cabin", "Group", ID_COL) if col in df_all.columns]
df_all.drop(columns=droppable, inplace=True)

# ------------------------------------------------------------
# ⑥ Separate the training and test rows again
# ------------------------------------------------------------
train_df = df_all[df_all["_is_train"] == 1].copy()
test_df = df_all[df_all["_is_train"] == 0].copy()

# Pull the target out of the training frame and encode it as 0/1 integers.
y = train_df.pop(TARGET_COL).map({True: 1, False: 0}).astype(int)

train_df.drop(columns=["_is_train"], inplace=True)
test_df.drop(columns=["_is_train"], inplace=True)

# Safety net: force the categorical columns to plain strings once more.
for frame in (train_df, test_df):
    for c in CAT_COLS:
        if c in frame.columns:
            frame[c] = frame[c].astype("string").astype(str)

# Give the test frame exactly the training columns, in the same order.
test_df = test_df.reindex(columns=train_df.columns, fill_value=np.nan)
cat_feat_names = [name for name in CAT_COLS if name in train_df.columns]

# ------------------------------------------------------------
# ⑦ Stratified K-Fold (5 splits)
# ------------------------------------------------------------
skf = StratifiedKFold(shuffle=True, n_splits=5, random_state=SEED)

# Materialise the (train rows, validation rows) pairs once so the same folds
# are available as a list to the training loop.
fold_indices = [(tr_rows, va_rows) for tr_rows, va_rows in skf.split(train_df, y)]

# ------------------------------------------------------------
# ⑧ 5-fold training loop
# ------------------------------------------------------------
oof = np.zeros(len(train_df))         # out-of-fold probabilities for the training rows
test_proba = np.zeros(len(test_df))   # fold-averaged probabilities for the test rows
scores = []                           # validation accuracy of each fold

# The test pool does not depend on the fold, so it is built once up front.
pool_te = Pool(test_df, cat_features=cat_feat_names)

for fold, (tr_idx, va_idx) in enumerate(fold_indices, 1):
    X_tr, X_va = train_df.iloc[tr_idx], train_df.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    pool_tr = Pool(X_tr, y_tr, cat_features=cat_feat_names)
    pool_va = Pool(X_va, y_va, cat_features=cat_feat_names)

    # ---- CatBoost model configuration ----
    model = CatBoostClassifier(
        loss_function="Logloss", eval_metric="Logloss",
        learning_rate=0.06, depth=6,
        l2_leaf_reg=3.0,                 # L2 regularisation used for the CPU baseline
        iterations=4000,                 # upper bound on trees; early stopping ends sooner
        random_seed=SEED,
        boosting_type="Plain",
        bootstrap_type="Bernoulli",      # row sampling scheme
        subsample=0.9,                   # fraction of rows sampled
        random_strength=1.0,             # randomness applied when scoring splits
        thread_count=1,                  # fixed for reproducibility
        verbose=False,
        allow_writing_files=False,
        task_type="CPU"
    )

    # ---- Fit with early stopping on this fold's validation pool ----
    model.fit(pool_tr, eval_set=pool_va, use_best_model=True, early_stopping_rounds=200)

    # ---- Validation predictions and fold accuracy at a 0.5 threshold ----
    va_p = model.predict_proba(pool_va)[:, 1]
    oof[va_idx] = va_p
    acc = accuracy_score(y_va, (va_p >= 0.5).astype(int))
    scores.append(acc)
    print(f"[Fold {fold}] Acc={acc:.5f} | best_iter={model.get_best_iteration()}")

    # Accumulate this fold's share of the averaged test probability.
    test_proba += model.predict_proba(pool_te)[:, 1] / len(fold_indices)

# ------------------------------------------------------------
# ⑨ Report the cross-validation score
# ------------------------------------------------------------
# Mean and standard deviation of the per-fold validation accuracies.
cv_mean, cv_std = np.mean(scores), np.std(scores)
print(f"CV Accuracy mean±std: {cv_mean:.5f} ± {cv_std:.5f}")

# ------------------------------------------------------------
# ⑩ Write the submission file (fixed 0.5 threshold)
# ------------------------------------------------------------
# Threshold the averaged probabilities, then express the labels as booleans.
test_pred = (test_proba >= 0.5).astype(int)
transported = pd.Series(test_pred).map({1: True, 0: False})

submission = pd.DataFrame(
    {
        "PassengerId": test["PassengerId"],
        "Transported": transported,
    }
)
submission.to_csv("submission_catboost_cpu_cv_deterministic.csv", index=False)
print("Saved: submission_catboost_cpu_cv_deterministic.csv")
