import numpy as np
import pandas as  pd
import statsmodels.api as sm
import seaborn
import statsmodels.formula.api as smf 
import matplotlib.pyplot as plt

# Render floats with two decimal places in pandas output.
pd.set_option('display.float_format', lambda value: '%.2f' % value)

# Load the data set.
data = pd.read_csv('ool.csv', low_memory=False)

# Convert W2_QG_2 to numeric; entries that cannot be parsed become NaN.
data['W2_QG_2'] = pd.to_numeric(data['W2_QG_2'], errors='coerce')

# Narrow the research question: keep rows whose PPAGECT4 lies in 1..4
# (inclusive) and whose W2_QG_2 equals 1.
in_age_range = data['PPAGECT4'].between(1, 4)
answered_one = data['W2_QG_2'] == 1
sub1 = data[in_age_range & answered_one]

# Keep only the two model variables and drop rows missing either of them.
model_columns = ['PPHHSIZE', 'PPHHHEAD']
sub2 = sub1[model_columns].dropna()

# Fit the logistic regression through the formula interface and show its summary.
print('Mô hình hồi quy logistic')
log_reg = smf.logit(formula='PPHHHEAD ~ PPHHSIZE', data=sub2).fit()
print(log_reg.summary())

####### EVALUATE THE LOGISTIC MODEL #######
# Hold-out evaluation of the PPHHHEAD-on-PPHHSIZE logistic model using the
# array-based statsmodels API (sm.Logit) rather than the formula API.
X = sub2[['PPHHSIZE']]
y = sub2[['PPHHHEAD']]

# Split the data into two sets: train (70%) and test (30%).
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42, shuffle=True)

# sm.Logit does NOT add an intercept on its own (unlike the formula API), so
# the constant column is added explicitly. has_constant='add' forces the column
# even if PPHHSIZE happens to take a single value within a split, which keeps
# the train and test design matrices aligned.
X_train_design = sm.add_constant(X_train, has_constant='add')
X_test_design = sm.add_constant(X_test, has_constant='add')

log_reg2 = sm.Logit(y_train, X_train_design).fit()

# Predicted probabilities for the test rows, rounded to integer class labels
# (vectorized; same round-half-to-even behaviour as the built-in round).
yhat = log_reg2.predict(X_test_design)
prediction = np.round(yhat).astype(int).tolist()

from sklearn.metrics import (confusion_matrix, accuracy_score)
# Confusion Matrix
cm = confusion_matrix(y_test, prediction)
print ("Confusion Matrix : \n", cm)

# Test accuracy
print('Test accuracy = ', accuracy_score(y_test, prediction))

############# TEST AGAIN ###########################
# Repeat the hold-out evaluation through the formula interface: split the
# two-column frame itself, fit on the training rows, score on the test rows.
print('TEST AGAIN')
train, test = train_test_split(
    sub2,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)
log_reg3 = smf.logit(formula='PPHHHEAD ~ PPHHSIZE', data=train).fit()

# Predicted probabilities for the test rows, turned into integer labels with
# the built-in round.
yhat = log_reg3.predict(test[['PPHHSIZE']])
prediction = [round(probability) for probability in yhat]

# Observed labels of the test rows, reused for both metrics below.
test_labels = test[['PPHHHEAD']]
cm = confusion_matrix(test_labels, prediction)
print ("Confusion Matrix : \n", cm)
print('Test accuracy = ', accuracy_score(test_labels, prediction))