# -*- coding: utf-8 -*-
import os, random, numpy as np, pandas as pd
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score

# One seed shared by every random number generator this script touches, so
# repeated runs produce the same folds and the same model.
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts,
# so setting it here presumably only affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = f'{SEED}'

# Input files, given as relative paths.
TRAIN_CSV = 'train.csv'
TEST_CSV = 'test.csv'

# Load both tables up front (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in (TRAIN_CSV, TEST_CSV))

# Amenity spending columns; they are summed per row into 'TotalSpend'.
SPEND_COLS = [
    'RoomService',
    'FoodCourt',
    'ShoppingMall',
    'Spa',
    'VRDeck',
]
# True/False flags that get encoded as 1/0.
BOOL_COLS = [
    'CryoSleep',
    'VIP',
]
# Categorical columns that are one-hot encoded before modelling.
CAT_COLS = [
    'HomePlanet',
    'Destination',
    'Deck',
    'Side',
]
# Row identifier (split on '_' during feature building) and the label column.
ID_COL = 'PassengerId'
TARGET_COL = 'Transported'

def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with cleaned and engineered passenger features.

    The input is not modified. The frame must contain ``CryoSleep``,
    ``Cabin``, ``Age``, ``VIP``, ``HomePlanet``, ``Destination``, the ID
    column (``ID_COL``) and every column in ``SPEND_COLS``.

    Steps, in order:

    * Columns in ``BOOL_COLS`` are mapped True/False -> 1/0 (other values
      become NaN at this point).
    * Spending is zeroed for rows with ``CryoSleep == 1``.
    * Missing ``CryoSleep`` is inferred from the row's total spend: 1 when
      the total is zero, otherwise 0.
    * ``Cabin`` is split on ``/`` into ``Deck`` (first part) and ``Side``
      (third part); missing parts become ``'Unknown'``.
    * The ID is split on ``_`` into ``Group`` and integer ``GroupOrder``.
    * ``Age`` and spending are coerced to numbers with missing values as 0;
      missing ``VIP`` becomes 0; missing values in ``CAT_COLS`` become
      ``'Unknown'``.
    * ``TotalSpend``, ``GroupSize``, ``IsAlone`` and ``HasSpend`` are derived.

    ``GroupSize`` counts the rows of *df* that share a ``Group``, so it only
    reflects the passengers present in the frame passed in.

    Args:
        df: Raw passenger table.

    Returns:
        A new DataFrame holding the original columns (cleaned) plus
        ``TotalSpend``, ``Deck``, ``Side``, ``Group``, ``GroupOrder``,
        ``GroupSize``, ``IsAlone`` and ``HasSpend``.
    """
    out = df.copy()

    for col in BOOL_COLS:
        if col in out.columns:
            out[col] = out[col].map({True: 1, False: 0})

    # Spend as recorded, before any cleaning. It is only used to infer the
    # missing CryoSleep flags below and is recomputed once the data is clean.
    # A skipna row sum is never NaN, so no extra fill is required.
    out['TotalSpend'] = out[SPEND_COLS].sum(axis=1, skipna=True)

    in_cryo = out['CryoSleep'] == 1
    out.loc[in_cryo, SPEND_COLS] = 0

    cryo_na = out['CryoSleep'].isna()
    no_spend = out['TotalSpend'] == 0
    out.loc[cryo_na & no_spend, 'CryoSleep'] = 1
    out.loc[cryo_na & ~no_spend, 'CryoSleep'] = 0

    # The split only yields a third column when at least one cabin has the
    # 'deck/num/side' form, so guard the lookups instead of assuming it.
    cabin = out['Cabin'].fillna('Unknown').str.split('/', expand=True)
    out['Deck'] = cabin[0].fillna('Unknown') if 0 in cabin.columns else 'Unknown'
    out['Side'] = cabin[2].fillna('Unknown') if 2 in cabin.columns else 'Unknown'

    pid = out[ID_COL].str.split('_', expand=True)
    out['Group'] = pid[0]
    out['GroupOrder'] = pid[1].astype(int)

    for col in ['Age'] + SPEND_COLS:
        out[col] = pd.to_numeric(out[col], errors='coerce').fillna(0)
    out['VIP'] = out['VIP'].fillna(0)
    for col in CAT_COLS:
        out[col] = out[col].fillna('Unknown')

    # Final total, now that cryo-sleep spending is zeroed and gaps are filled.
    out['TotalSpend'] = out[SPEND_COLS].sum(axis=1, skipna=True)

    # Derive the group features from the data instead of emitting constant
    # placeholders (a size of 1 flagged as "not alone" contradicts itself).
    out['GroupSize'] = out.groupby('Group')[ID_COL].transform('count')
    out['IsAlone'] = (out['GroupSize'] == 1).astype(int)
    out['HasSpend'] = (out['TotalSpend'] > 0).astype(int)
    return out

# Tag each row with its origin so the two sets can be separated again after
# they have been featurized and one-hot encoded together (which guarantees
# identical dummy columns for both).
train['_is_train'] = 1
test['_is_train'] = 0
df_all = pd.concat([train, test], ignore_index=True)
df_all = build_features(df_all)

# Group statistics are taken over train and test combined.
df_all['GroupSize'] = df_all.groupby('Group')[ID_COL].transform('count')
df_all['IsAlone'] = (df_all['GroupSize'] == 1).astype(int)

df_all = pd.get_dummies(df_all, columns=CAT_COLS, drop_first=False)
# Raw text columns whose information has already been extracted (or unused).
df_all = df_all.drop(columns=['Name', 'Cabin', 'Group'], errors='ignore')

train_df = df_all[df_all['_is_train'] == 1].copy()
test_df = df_all[df_all['_is_train'] == 0].copy()

y = train_df[TARGET_COL].map({True: 1, False: 0}).astype(int)

# The passenger ID is a row identifier stored as a string, not a predictor:
# keep it out of the feature matrices so the estimator only receives numeric
# feature columns. Its useful parts already live in GroupOrder / GroupSize.
non_feature_cols = [TARGET_COL, '_is_train', ID_COL]
X = train_df.drop(columns=non_feature_cols)
X_test = (
    test_df.drop(columns=non_feature_cols, errors='ignore')
    .reindex(columns=X.columns, fill_value=0)
)

# Stratified 5-fold cross-validation. Each fold fits a fresh model, scores it
# on the held-out part, and contributes 1/n_splits of its test-set
# probabilities to the averaged test prediction.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
oof = np.zeros(len(X), dtype=float)          # out-of-fold probabilities
test_proba = np.zeros(len(X_test), dtype=float)
scores = []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), start=1):
    model = HistGradientBoostingClassifier(
        max_iter=220,
        learning_rate=0.07,
        random_state=SEED,
    )
    model.fit(X.iloc[tr_idx], y.iloc[tr_idx])

    # Probability of the positive class for the held-out rows.
    va_p = model.predict_proba(X.iloc[va_idx])[:, 1]
    oof[va_idx] = va_p

    acc = accuracy_score(y.iloc[va_idx], (va_p >= 0.5).astype(int))
    scores.append(acc)
    print(f'[Fold {fold}] Accuracy: {acc:.5f}')

    test_proba += model.predict_proba(X_test)[:, 1] / skf.n_splits

print(f'CV mean±std: {np.mean(scores):.5f} ± {np.std(scores):.5f}')
# Threshold the fold-averaged probabilities at 0.5 to get the final labels.
pred = (test_proba >= 0.5).astype(int)

# Column headers and the file name are derived from the shared constants so
# they cannot drift from the configuration used for this run.
submission = pd.DataFrame({
    ID_COL: test[ID_COL],
    TARGET_COL: pred.astype(bool),
})
submission_path = f'submission_gbdt_cv_seed{SEED}.csv'
submission.to_csv(submission_path, index=False)
print(f'Saved: {submission_path}')
