# home-credit-modeling notebook (exported from home-credit-modeling (1).ipynb)
# --- Model-training setup ----------------------------------------------------
# Build float32 training tensors from the feature frame and wrap them in a
# shuffled DataLoader.
from torch.utils.data import DataLoader, TensorDataset

torch.manual_seed(42)  # reproducible shuffling / initialization

batch_size = 64
num_epochs = 10
loss_history = []  # per-epoch mean loss, appended by the training loop

# Identifier / non-feature columns are dropped; everything remaining is fed
# to the model.  NOTE(review): WEEK_NUM stays in X here and is split off per
# batch inside the training loop.
X = data.drop(columns=["target", "case_id", "MONTH"])
y = data["target"]

X_train_tensor = torch.tensor(X.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y.values, dtype=torch.float32)

train_ds = TensorDataset(X_train_tensor, y_train_tensor)
train_dl = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
# Training loop.  Each batch additionally evaluates the gini stability metric
# on the batch predictions (WEEK_NUM / score / target frame).
for epoch in range(num_epochs):
    epoch_loss = 0.0
    accuracy_hist_train = 0.0
    for x_batch, y_batch in train_dl:
        # Column 3 of the batch is WEEK_NUM: kept only for the stability
        # metric, never fed to the model.
        x_batch_not_week = torch.cat((x_batch[:, :3], x_batch[:, 4:]), dim=1)
        week = x_batch[:, 3]

        pred = model(x_batch_not_week).view(-1)  # flatten (N, 1) -> (N,) so shapes match y_batch
        # Hard 0/1 labels; assumes the model emits sigmoid probabilities — TODO confirm.
        pred_int = pred.round().int()

        # Build the (N, 3) frame [WEEK_NUM, score, target] for gini_stability.
        # BUG FIX: the original concatenated the three vectors end-to-end
        # (length 3N) and then called base.view(3,), which only works when
        # the batch has a single row.  Stacking column-wise gives the
        # intended N x 3 layout for any batch size.
        base = torch.stack([week, pred_int.float(), y_batch.float()], dim=1)
        df_base = pd.DataFrame(base.detach().numpy(),
                               columns=['WEEK_NUM', 'score', 'target'])
        score = gini_stability(df_base)  # NOTE(review): computed but never accumulated/reported

        y_batch = y_batch.float()
        loss = loss_fn(pred, y_batch)
        epoch_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # BUG FIX: argmax over dim=1 of a single-column prediction is always
        # 0, so the original accuracy was meaningless.  Compare the rounded
        # scores directly against the labels instead.
        is_correct = (pred_int == y_batch.int()).float()
        accuracy_hist_train += is_correct.sum().item()

    epoch_loss /= len(train_dl.dataset)
    loss_history.append(epoch_loss)
    accuracy_hist_train /= len(train_dl.dataset)
    print(f'epoch {epoch} accuracy {accuracy_hist_train:.4f} loss {epoch_loss:.4f}')
# --- Fold construction and validation (LightGBM) -----------------------------
# Group-aware stratified folds: grouping by WEEK_NUM keeps every row of one
# week inside a single fold, so the stability metric is scored on unseen weeks.
cv = StratifiedGroupKFold(n_splits=5)
for train_idx, valid_idx in cv.split(X, y, groups=weeks):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

    # Base frame for the metric: validation features plus their WEEK_NUM.
    week = weeks.iloc[valid_idx]
    df_base = pd.concat([X_valid, week], axis=1)

    model = lgb.LGBMClassifier()
    model.fit(X_train, y_train)

    pred = model.predict_proba(X_valid)[:, 1]
    # BUG FIX: the original wrote model.predict(X_valid) — hard 0/1 labels —
    # into 'score' and discarded the probabilities computed above; the
    # gini/AUC-style stability metric needs the positive-class probability.
    df_base['score'] = pred

    # Append the target column (y_valid keeps its own column name).
    df_base = pd.concat([df_base, y_valid], axis=1)

    score = gini_stability(df_base)
    # score = roc_auc_score(y_valid, pred)
    score_folds.append(score)
from catboost import Pool
# Cross-validated CatBoost evaluation on the categorical feature subset only.
for idx_train, idx_valid in cv.split(X, y, groups=weeks):
    X_train, y_train = X.iloc[idx_train], y.iloc[idx_train]
    X_valid, y_valid = X.iloc[idx_valid], y.iloc[idx_valid]

    # CatBoost consumes categorical columns natively via Pool/cat_features.
    X_train[CATS] = X_train[CATS].astype("category")
    X_valid[CATS] = X_valid[CATS].astype("category")
    pool_train = Pool(X_train[CATS], y_train, cat_features=CATS)
    pool_valid = Pool(X_valid[CATS], y_valid, cat_features=CATS)

    model_1 = cat.CatBoostClassifier(eval_metric='AUC')
    model_1.fit(pool_train, eval_set=pool_valid, verbose=300)

    # The model was fit on X_*[CATS] only, so predict on that same column
    # subset.  BUG FIXES: the original called predict_proba on the full
    # X_valid (feature-set mismatch with the trained model) and then
    # referenced the misspelled name X_vaild (NameError).
    y_pred_valid = model_1.predict_proba(X_valid[CATS])[:, 1]
    X_valid['score'] = y_pred_valid  # probability score for the stability metric
    X_valid['WEEK_NUM'] = weeks.iloc[idx_valid]

    # X_valid already carries the 'score' column; the original concatenated
    # it a second time, producing a duplicate column in `base`.
    base = pd.concat([X_valid, y_valid], axis=1)

    score = gini_stability(base)
    gini_score.append(score)

    auc_score = roc_auc_score(y_valid, y_pred_valid)
    cv_scores_xgb.append(auc_score)  # NOTE(review): list name says xgb but holds CatBoost AUCs