"""Fit and evaluate a logistic regression of PPHHHEAD on PPHHSIZE.

The script reads ``ool.csv``, keeps the rows with ``PPAGECT4`` in 1..4 and
``W2_QG_2 == 1``, fits a logistic regression on the full subset, and then
evaluates the same model specification on a 70/30 train/test split twice:
once through the array API (``sm.Logit``) and once through the formula API
(``smf.logit``).
"""
import numpy as np
import pandas as pd
import statsmodels.api as sm
import seaborn
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split

pd.set_option('display.float_format', lambda x: '%.2f' % x)


def _report_test_performance(y_true, probabilities):
    """Print the confusion matrix and accuracy for predicted probabilities.

    Args:
        y_true: Observed labels of the test set.
        probabilities: Predicted probabilities, one per test row.

    Returns:
        A ``(prediction, cm)`` tuple: the list of rounded 0/1 predictions and
        the confusion matrix computed from them.
    """
    # round() turns each probability into a class label (cut-off at 0.5).
    prediction = list(map(round, probabilities))
    cm = confusion_matrix(y_true, prediction)
    print("Confusion Matrix : \n", cm)
    print('Test accuracy = ', accuracy_score(y_true, prediction))
    return prediction, cm


# Read the data
data = pd.read_csv('ool.csv', low_memory=False)

# Convert to numeric; values that cannot be parsed become NaN
data['W2_QG_2'] = pd.to_numeric(data['W2_QG_2'], errors='coerce')

# Narrow the sample down to the research question
sub1 = data[(data['PPAGECT4'] >= 1)
            & (data['PPAGECT4'] <= 4)
            & (data['W2_QG_2'] == 1)]
sub2 = sub1[['PPHHSIZE', 'PPHHHEAD']].dropna()
# NOTE(review): logit needs a 0/1 response -- confirm PPHHHEAD is coded so.

print('Mô hình hồi quy logistic')
log_reg = smf.logit(formula='PPHHHEAD ~ PPHHSIZE', data=sub2).fit()
print(log_reg.summary())

####### EVALUATE THE LOGISTIC MODEL #######
X = sub2[['PPHHSIZE']]
y = sub2[['PPHHHEAD']]

# Split the data into two sets: train and test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, shuffle=True)

# sm.Logit (array API) does not add an intercept on its own, unlike the
# formula API used above.  Add the constant explicitly so this model is the
# same specification as 'PPHHHEAD ~ PPHHSIZE'.  has_constant='add' forces the
# column even if PPHHSIZE happens to be constant within one of the splits,
# keeping the train and test design matrices aligned.
X_train_const = sm.add_constant(X_train, has_constant='add')
X_test_const = sm.add_constant(X_test, has_constant='add')

log_reg2 = sm.Logit(y_train, X_train_const).fit()
yhat = log_reg2.predict(X_test_const)
prediction, cm = _report_test_performance(y_test, yhat)

############# TEST AGAIN ###########################
print('TEST AGAIN')
# Same split parameters as above, applied to the whole frame so the formula
# API can be used for fitting.
train, test = train_test_split(
    sub2, test_size=0.3, random_state=42, shuffle=True)
log_reg3 = smf.logit(formula='PPHHHEAD ~ PPHHSIZE', data=train).fit()
yhat = log_reg3.predict(test[['PPHHSIZE']])
prediction, cm = _report_test_performance(test[['PPHHHEAD']], yhat)