i am fairly new to R and Python. I like to perform multiple regression using Akaike Information Criterion for variable selection and to evaluate my criterion.
I have written some code to select my variables using the F Statistic P value. The dataset consists of information of housing prices
i am planning to regress the variables (i.e., xvar) onto resale price (i.e., yvar).
instead of using minpv to select my variables, i would like to use the AIC to select the variables.
Would also like to evaluate the resale price criterion using AIC instead of fpv
# Multiple Regression Variable Selection
def mr(selection=False):
import os
os.chdir(r'C:\Users\Path')
import pandas as pd
h=pd.read_csv('Dataset.csv',index_col=0)
#print(h.head(0)) # dataset's variable names
yvar='resale_price'
modeleq = yvar + ' ~'
for xvar in ( # Insert new 'x variable' into a row, ending with ','
'storey_range_lower',
'storey_range_lower_rt',
'storey_range_lower_sq',
'storey_range_upper',
'storey_range_upper_rt',
'storey_range_upper_sq',
'floor_area_sqm',
'floor_area_sqm_rt',
'floor_area_sqm_sq',
'lease_commence_year',
'lease_commence_year_rt',
'lease_commence_year_sq',
'transaction_month',
'transaction_month_rt',
'transaction_month_sq',
'town',
'flat_model',
'flat_type',
'no_of_rooms',
'block_number',
'block_number_rt',
'block_number_sq',
'postal_code',
'postal_code_rt',
'postal_code_sq',
'postal_code_2digit',
'postal_code_2digit_rt',
'postal_code_2digit_sq',
):
if modeleq[-1] == '~':
modeleq = modeleq + ' ' + xvar
else:
modeleq = modeleq + ' + ' + xvar
#import matplotlib.pyplot as pl
#%matplotlib inline
#import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols
bmodeleq=modeleq
if selection :
print('Variable Selection using p-value & PR(>F):')
minfpv = 1.0
while True :
#Specify C() for Categorical, else could be interpreted as numeric:
#hout=ols('resale_price ~ floor_area_sqm + C(flat_type)', data=h).fit()
hout=ols(modeleq, data=h).fit()
if modeleq.find(' + ') == -1 :
# 1 xvar left
break
#print(dir(hout)) gives all the attributes of .fit(), e.g. .fvalue & .f_pvalue
fpv=hout.f_pvalue
if fpv < minfpv :
minfpv=fpv
bmodeleq=modeleq
print('\nF-statistic =',hout.fvalue,' PR(>F) =',fpv)
prf = sm.stats.anova_lm(hout, typ=3)['PR(>F)']
maxp=max(prf[1:])
#print('\n',dict(prf))
xdrop = prf[maxp==prf].axes[0][0] # 1st element of row-label .axes[0]
#if xdrop.find('Intercept') != -1 :
# break
# xdrop removed from model equation:
if (modeleq.find('~ ' + xdrop + ' + ') != -1):
modeleq = modeleq.replace('~ ' + xdrop + ' + ','~ ')
elif (modeleq.find('+ ' + xdrop + ' + ') != -1):
modeleq = modeleq.replace('+ ' + xdrop + ' + ','+ ')
else:
modeleq = modeleq.replace(' + ' + xdrop,'')
#print('Model equation:',modeleq,'\n')
print('Variable to drop:',xdrop,' p-value =',prf[xdrop])
#print('\nVariable left:\n'+str(prf[maxp!=prf][:-1]),'\n')
print('\nF-statistic =',hout.fvalue,' PR(>F) =',hout.f_pvalue)
print('Variable left:\n'+str(prf[maxp!=prf][:-1]),'\n')
#input("found intercept")
print('Best model equation:',bmodeleq)
print('Minimum PR(>F) =',minfpv,'\n')
hout=ols(bmodeleq, data=h).fit()
print(sm.stats.anova_lm(hout, typ=1))
#print(anova) # Anova table with 'Treatment' broken up
hsum=hout.summary()
print('\n',hsum)
last=3 #number of bottom p-values to display with more precision
#p-values are not in general the same as PR(>F) from ANOVA
print("\nLast",last,"x-coefficients' p-values:")
nxvar=len(hout.pvalues)
for i in range(last,0,-1):
print(' ',hout.pvalues.axes[0][nxvar-i],' ',hout.pvalues[nxvar-i])
# Output Coefficient table:
#from IPython.core.display import HTML
#HTML(hout.summary().tables[1].as_html()) #.tables[] from 0 to 3
mr(True) # do Variable Selection
#mr() # do multiple regression once
Appreciate any form of help i can get and thank you in advance!!
In your codes,
Line 66
gives the clues on answering your question.hout
has anaic
attribute that you can call usinghout.aic
The straight-out answer is to use
hout.aic
instead ofhout.f_pvalue
forLine 67
.However, you need to re-specify the initial check value
minfpv
since 1.0 would be too small for AIC in this case. That is forLine 56
.Try it out and see what the initial
minfpv
should be.Neo
:)