# -*- coding: utf-8 -*-
"""
Spaceship Titanic — CatBoost CV パイプライン（seed=42, 5fold, soft-vote）
v1.2+ 対応版（deterministic_mode削除）

主なポイント
-------------
- カテゴリ列はすべて文字列に変換
- cat_features は列名で指定（列順依存なし）
- train/test の列順を reindex で完全一致
- seed=42固定、StratifiedKFold(5)、soft-vote 平均
"""

import os, random, numpy as np, pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# ------------------ Reproducibility ------------------
# One seed drives Python's `random` module and NumPy's global RNG.
SEED = 42
# NOTE(review): PYTHONHASHSEED is read when the interpreter starts, so setting
# it here presumably only affects child processes, not this process's hashing.
os.environ["PYTHONHASHSEED"] = str(SEED)
for _seed_rng in (random.seed, np.random.seed):
    _seed_rng(SEED)

# ------------------ IO ------------------
# Both CSVs are resolved relative to the current working directory.
TRAIN_CSV = "train.csv"
TEST_CSV  = "test.csv"
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# ------------------ Column definitions ------------------
# Per-amenity spending columns; build_features sums them into "TotalSpend".
SPEND_COLS = ["RoomService","FoodCourt","ShoppingMall","Spa","VRDeck"]
# True/False flags that build_features maps to 1/0.
BOOL_COLS  = ["CryoSleep","VIP"]
# Columns that build_features converts to plain strings (missing -> "Unknown").
CAT_COLS   = ["HomePlanet","Destination","Deck","Side"]
ID_COL     = "PassengerId"
TARGET_COL = "Transported"

# ------------------ Feature engineering ------------------
def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with cleaned columns and engineered features.

    The input frame is left untouched.  It must contain ``PassengerId``
    (``<group>_<order>``), ``Cabin``, ``CryoSleep``, ``VIP``, ``Age``,
    ``HomePlanet``, ``Destination`` and every column in ``SPEND_COLS``.

    Processing steps:

    * ``CryoSleep`` / ``VIP``: ``True``/``False`` become 1/0.
    * Rows with ``CryoSleep == 1`` get all spending columns set to 0.
    * Missing ``CryoSleep`` is imputed from spending: 1 when the row's total
      spend is 0, otherwise 0.
    * ``Cabin`` is split on ``/`` into ``Deck`` (first part) and ``Side``
      (third part); anything missing becomes ``"Unknown"``.
    * ``PassengerId`` is split on ``_`` into ``Group`` and integer
      ``GroupOrder``.
    * ``Age`` and the spending columns are coerced to numeric, with
      unparsable or missing values replaced by 0; missing ``VIP`` becomes 0.
    * Columns in ``CAT_COLS`` become plain ``str`` with ``"Unknown"`` for
      missing values.
    * Adds ``TotalSpend``, ``GroupSize``, ``IsAlone``, ``HasSpend`` and
      ``SpendPerAge``.

    ``GroupSize`` / ``IsAlone`` are counted over the rows of *df* only, so
    pass train and test concatenated if groups may span both.

    Args:
        df: Raw passenger table.

    Returns:
        A new DataFrame holding the original columns (cleaned) followed by
        the engineered ones.
    """
    out = df.copy()

    for col in BOOL_COLS:
        if col in out.columns:
            out[col] = out[col].map({True: 1, False: 0})

    # Provisional total, used only to impute CryoSleep below.  With
    # skipna=True an all-missing row sums to 0, so the result is never NaN.
    out["TotalSpend"] = out[SPEND_COLS].sum(axis=1, skipna=True)

    # Passengers known to be in cryo-sleep have their spending forced to 0.
    in_cryo = out["CryoSleep"] == 1
    out.loc[in_cryo, SPEND_COLS] = 0

    # Unknown CryoSleep: no spending -> treated as asleep, any spending -> awake.
    cryo_missing = out["CryoSleep"].isna()
    no_spend = out["TotalSpend"] == 0
    out.loc[cryo_missing & no_spend, "CryoSleep"] = 1
    out.loc[cryo_missing & ~no_spend, "CryoSleep"] = 0

    # Force exactly three columns: when no Cabin contains "/" (e.g. every
    # value is missing) the split yields fewer, and indexing part 2 would
    # raise KeyError.
    cabin_parts = (
        out["Cabin"]
        .fillna("Unknown")
        .str.split("/", expand=True)
        .reindex(columns=range(3), fill_value="Unknown")
    )
    out["Deck"] = cabin_parts[0].fillna("Unknown")
    out["Side"] = cabin_parts[2].fillna("Unknown")

    id_parts = out[ID_COL].str.split("_", expand=True)
    out["Group"] = id_parts[0]
    out["GroupOrder"] = id_parts[1].astype(int)

    for col in ["Age", *SPEND_COLS]:
        out[col] = pd.to_numeric(out[col], errors="coerce").fillna(0)

    out["VIP"] = out["VIP"].fillna(0)

    for col in CAT_COLS:
        out[col] = out[col].astype("string").fillna("Unknown").astype(str)

    # Final total, after the cryo-sleep zeroing and missing-value fill.
    out["TotalSpend"] = out[SPEND_COLS].sum(axis=1, skipna=True)

    # Real group statistics rather than constant placeholders, so the result
    # is correct even when the caller does not recompute them.
    out["GroupSize"] = out.groupby("Group")[ID_COL].transform("count")
    out["IsAlone"] = (out["GroupSize"] == 1).astype(int)

    out["HasSpend"] = (out["TotalSpend"] > 0).astype(int)
    # +1 keeps the ratio finite for Age == 0 (which includes imputed ages).
    out["SpendPerAge"] = out["TotalSpend"] / (out["Age"] + 1.0)
    return out

# ------------------ Combine train/test ------------------
# Tag the origin of every row so the frames can be separated again after the
# joint feature engineering.
train["_is_train"] = 1
test["_is_train"]  = 0
df_all = build_features(pd.concat([train, test], ignore_index=True))

# Group statistics are counted over train and test together.
df_all["GroupSize"] = df_all.groupby("Group")[ID_COL].transform("count")
df_all["IsAlone"]   = df_all["GroupSize"].eq(1).astype(int)

# Remove identifier / free-text columns that are not used as features.
df_all = df_all.drop(columns=["Name", "Cabin", "Group", ID_COL], errors="ignore")

train_df = df_all.loc[df_all["_is_train"].eq(1)].copy()
test_df  = df_all.loc[df_all["_is_train"].eq(0)].copy()

# Take the label out of the training frame and encode it as 0/1.
y = train_df.pop(TARGET_COL).map({True: 1, False: 0}).astype(int)
train_df = train_df.drop(columns=["_is_train"])
test_df  = test_df.drop(columns=["_is_train"])

# Make sure every categorical column holds plain Python strings.
for frame in (train_df, test_df):
    for cat_col in CAT_COLS:
        if cat_col in frame.columns:
            frame[cat_col] = frame[cat_col].astype("string").astype(str)

# Give the test frame exactly the training columns, in the same order; this
# also discards the label column that the concat left in the test rows.
test_df = test_df.reindex(columns=train_df.columns, fill_value=np.nan)
cat_feat_names = [name for name in CAT_COLS if name in train_df.columns]

# ------------------ 5-Fold CV ------------------
# Stratified 5-fold CV.  Each fold trains one CatBoost model; the test
# prediction is the mean of the per-fold positive-class probabilities
# (soft vote), thresholded at 0.5.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
oof = np.zeros(len(train_df))          # out-of-fold probabilities, one per training row
test_proba = np.zeros(len(test_df))    # running soft-vote average over folds
scores = []                            # per-fold validation accuracy

# The test pool is identical for every fold, so build it once instead of
# re-creating it inside the loop.
pool_te = Pool(test_df, cat_features=cat_feat_names)

for fold, (tr_idx, va_idx) in enumerate(skf.split(train_df, y), 1):
    X_tr, X_va = train_df.iloc[tr_idx], train_df.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    pool_tr = Pool(X_tr, y_tr, cat_features=cat_feat_names)
    pool_va = Pool(X_va, y_va, cat_features=cat_feat_names)

    model = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="Logloss",
        learning_rate=0.06,
        depth=6,
        l2_leaf_reg=3.0,
        iterations=4000,
        random_seed=SEED,
        boosting_type="Plain",
        bootstrap_type="Bernoulli",
        subsample=0.9,
        rsm=0.9,
        verbose=False,
        allow_writing_files=False,
        task_type="CPU",  # used in place of the removed deterministic_mode option
    )

    # NOTE: the validation fold also drives early stopping / best-iteration
    # selection, so the fold accuracy printed below is slightly optimistic.
    model.fit(pool_tr, eval_set=pool_va, use_best_model=True, early_stopping_rounds=200)

    va_p = model.predict_proba(pool_va)[:, 1]
    oof[va_idx] = va_p
    acc = accuracy_score(y_va, (va_p >= 0.5).astype(int))
    scores.append(acc)
    print(f"[Fold {fold}] Accuracy: {acc:.5f} | best_iter={model.get_best_iteration()}")

    # Each fold contributes 1/n_splits of the final averaged probability.
    test_proba += model.predict_proba(pool_te)[:, 1] / skf.n_splits

print(f"CV Accuracy mean±std: {np.mean(scores):.5f} ± {np.std(scores):.5f}")

test_pred = (test_proba >= 0.5).astype(int)
submission = pd.DataFrame({
    "PassengerId": test["PassengerId"],
    # Plain boolean array: assigned positionally, so it cannot be misaligned
    # by index the way an intermediate Series could.
    "Transported": test_pred.astype(bool),
})
submission.to_csv("submission_catboost_cv_seed42_fixed.csv", index=False)
print("Saved: submission_catboost_cv_seed42_fixed.csv")
