import pandas as pd
import numpy as np
import seaborn as sns
import plotly.graph_objects as go
import matplotlib.pyplot as plt
from pandas import set_option
from scipy.stats import norm, skew
#Load data
from google.colab import drive
drive.mount('/content/gdrive')
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code Enter your authorization code: ·········· Mounted at /content/gdrive
df_train = pd.read_csv('gdrive/My Drive/dataset/train.csv')
df_test = pd.read_csv('gdrive/My Drive/dataset/test.csv')
# First inspection
df_train.shape, df_test.shape, df_train.columns.values
((1460, 81), (1459, 80), array(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscFeature', 'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'SalePrice'], dtype=object))
df_train.describe()
Id | MSSubClass | LotFrontage | LotArea | OverallQual | OverallCond | YearBuilt | YearRemodAdd | MasVnrArea | BsmtFinSF1 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | TotRmsAbvGrd | Fireplaces | GarageYrBlt | GarageCars | GarageArea | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | MiscVal | MoSold | YrSold | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 1460.000000 | 1460.000000 | 1201.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1452.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1379.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 | 1460.000000 |
mean | 730.500000 | 56.897260 | 70.049958 | 10516.828082 | 6.099315 | 5.575342 | 1971.267808 | 1984.865753 | 103.685262 | 443.639726 | 46.549315 | 567.240411 | 1057.429452 | 1162.626712 | 346.992466 | 5.844521 | 1515.463699 | 0.425342 | 0.057534 | 1.565068 | 0.382877 | 2.866438 | 1.046575 | 6.517808 | 0.613014 | 1978.506164 | 1.767123 | 472.980137 | 94.244521 | 46.660274 | 21.954110 | 3.409589 | 15.060959 | 2.758904 | 43.489041 | 6.321918 | 2007.815753 | 180921.195890 |
std | 421.610009 | 42.300571 | 24.284752 | 9981.264932 | 1.382997 | 1.112799 | 30.202904 | 20.645407 | 181.066207 | 456.098091 | 161.319273 | 441.866955 | 438.705324 | 386.587738 | 436.528436 | 48.623081 | 525.480383 | 0.518911 | 0.238753 | 0.550916 | 0.502885 | 0.815778 | 0.220338 | 1.625393 | 0.644666 | 24.689725 | 0.747315 | 213.804841 | 125.338794 | 66.256028 | 61.119149 | 29.317331 | 55.757415 | 40.177307 | 496.123024 | 2.703626 | 1.328095 | 79442.502883 |
min | 1.000000 | 20.000000 | 21.000000 | 1300.000000 | 1.000000 | 1.000000 | 1872.000000 | 1950.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 334.000000 | 0.000000 | 0.000000 | 334.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 1900.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 1.000000 | 2006.000000 | 34900.000000 |
25% | 365.750000 | 20.000000 | 59.000000 | 7553.500000 | 5.000000 | 5.000000 | 1954.000000 | 1967.000000 | 0.000000 | 0.000000 | 0.000000 | 223.000000 | 795.750000 | 882.000000 | 0.000000 | 0.000000 | 1129.500000 | 0.000000 | 0.000000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 | 5.000000 | 0.000000 | 1961.000000 | 1.000000 | 334.500000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 5.000000 | 2007.000000 | 129975.000000 |
50% | 730.500000 | 50.000000 | 69.000000 | 9478.500000 | 6.000000 | 5.000000 | 1973.000000 | 1994.000000 | 0.000000 | 383.500000 | 0.000000 | 477.500000 | 991.500000 | 1087.000000 | 0.000000 | 0.000000 | 1464.000000 | 0.000000 | 0.000000 | 2.000000 | 0.000000 | 3.000000 | 1.000000 | 6.000000 | 1.000000 | 1980.000000 | 2.000000 | 480.000000 | 0.000000 | 25.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 6.000000 | 2008.000000 | 163000.000000 |
75% | 1095.250000 | 70.000000 | 80.000000 | 11601.500000 | 7.000000 | 6.000000 | 2000.000000 | 2004.000000 | 166.000000 | 712.250000 | 0.000000 | 808.000000 | 1298.250000 | 1391.250000 | 728.000000 | 0.000000 | 1776.750000 | 1.000000 | 0.000000 | 2.000000 | 1.000000 | 3.000000 | 1.000000 | 7.000000 | 1.000000 | 2002.000000 | 2.000000 | 576.000000 | 168.000000 | 68.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 0.000000 | 8.000000 | 2009.000000 | 214000.000000 |
max | 1460.000000 | 190.000000 | 313.000000 | 215245.000000 | 10.000000 | 9.000000 | 2010.000000 | 2010.000000 | 1600.000000 | 5644.000000 | 1474.000000 | 2336.000000 | 6110.000000 | 4692.000000 | 2065.000000 | 572.000000 | 5642.000000 | 3.000000 | 2.000000 | 3.000000 | 2.000000 | 8.000000 | 3.000000 | 14.000000 | 3.000000 | 2010.000000 | 4.000000 | 1418.000000 | 857.000000 | 547.000000 | 552.000000 | 508.000000 | 480.000000 | 738.000000 | 15500.000000 | 12.000000 | 2010.000000 | 755000.000000 |
#Head of data of train
df_train.head(5)
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | ... | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | SalePrice | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1 | 60 | RL | 65.0 | 8450 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2003 | 2003 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 196.0 | Gd | TA | PConc | Gd | TA | No | GLQ | 706 | Unf | 0 | 150 | 856 | GasA | ... | Y | SBrkr | 856 | 854 | 0 | 1710 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 8 | Typ | 0 | NaN | Attchd | 2003.0 | RFn | 2 | 548 | TA | TA | Y | 0 | 61 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2008 | WD | Normal | 208500 |
1 | 2 | 20 | RL | 80.0 | 9600 | Pave | NaN | Reg | Lvl | AllPub | FR2 | Gtl | Veenker | Feedr | Norm | 1Fam | 1Story | 6 | 8 | 1976 | 1976 | Gable | CompShg | MetalSd | MetalSd | None | 0.0 | TA | TA | CBlock | Gd | TA | Gd | ALQ | 978 | Unf | 0 | 284 | 1262 | GasA | ... | Y | SBrkr | 1262 | 0 | 0 | 1262 | 0 | 1 | 2 | 0 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1976.0 | RFn | 2 | 460 | TA | TA | Y | 298 | 0 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 5 | 2007 | WD | Normal | 181500 |
2 | 3 | 60 | RL | 68.0 | 11250 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | CollgCr | Norm | Norm | 1Fam | 2Story | 7 | 5 | 2001 | 2002 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 162.0 | Gd | TA | PConc | Gd | TA | Mn | GLQ | 486 | Unf | 0 | 434 | 920 | GasA | ... | Y | SBrkr | 920 | 866 | 0 | 1786 | 1 | 0 | 2 | 1 | 3 | 1 | Gd | 6 | Typ | 1 | TA | Attchd | 2001.0 | RFn | 2 | 608 | TA | TA | Y | 0 | 42 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 9 | 2008 | WD | Normal | 223500 |
3 | 4 | 70 | RL | 60.0 | 9550 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | Crawfor | Norm | Norm | 1Fam | 2Story | 7 | 5 | 1915 | 1970 | Gable | CompShg | Wd Sdng | Wd Shng | None | 0.0 | TA | TA | BrkTil | TA | Gd | No | ALQ | 216 | Unf | 0 | 540 | 756 | GasA | ... | Y | SBrkr | 961 | 756 | 0 | 1717 | 1 | 0 | 1 | 0 | 3 | 1 | Gd | 7 | Typ | 1 | Gd | Detchd | 1998.0 | Unf | 3 | 642 | TA | TA | Y | 0 | 35 | 272 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 2 | 2006 | WD | Abnorml | 140000 |
4 | 5 | 60 | RL | 84.0 | 14260 | Pave | NaN | IR1 | Lvl | AllPub | FR2 | Gtl | NoRidge | Norm | Norm | 1Fam | 2Story | 8 | 5 | 2000 | 2000 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 350.0 | Gd | TA | PConc | Gd | TA | Av | GLQ | 655 | Unf | 0 | 490 | 1145 | GasA | ... | Y | SBrkr | 1145 | 1053 | 0 | 2198 | 1 | 0 | 2 | 1 | 4 | 1 | Gd | 9 | Typ | 1 | TA | Attchd | 2000.0 | RFn | 3 | 836 | TA | TA | Y | 192 | 84 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 12 | 2008 | WD | Normal | 250000 |
5 rows × 81 columns
#head of data of test
df_test.head(5)
Id | MSSubClass | MSZoning | LotFrontage | LotArea | Street | Alley | LotShape | LandContour | Utilities | LotConfig | LandSlope | Neighborhood | Condition1 | Condition2 | BldgType | HouseStyle | OverallQual | OverallCond | YearBuilt | YearRemodAdd | RoofStyle | RoofMatl | Exterior1st | Exterior2nd | MasVnrType | MasVnrArea | ExterQual | ExterCond | Foundation | BsmtQual | BsmtCond | BsmtExposure | BsmtFinType1 | BsmtFinSF1 | BsmtFinType2 | BsmtFinSF2 | BsmtUnfSF | TotalBsmtSF | Heating | HeatingQC | CentralAir | Electrical | 1stFlrSF | 2ndFlrSF | LowQualFinSF | GrLivArea | BsmtFullBath | BsmtHalfBath | FullBath | HalfBath | BedroomAbvGr | KitchenAbvGr | KitchenQual | TotRmsAbvGrd | Functional | Fireplaces | FireplaceQu | GarageType | GarageYrBlt | GarageFinish | GarageCars | GarageArea | GarageQual | GarageCond | PavedDrive | WoodDeckSF | OpenPorchSF | EnclosedPorch | 3SsnPorch | ScreenPorch | PoolArea | PoolQC | Fence | MiscFeature | MiscVal | MoSold | YrSold | SaleType | SaleCondition | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 1461 | 20 | RH | 80.0 | 11622 | Pave | NaN | Reg | Lvl | AllPub | Inside | Gtl | NAmes | Feedr | Norm | 1Fam | 1Story | 5 | 6 | 1961 | 1961 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | CBlock | TA | TA | No | Rec | 468.0 | LwQ | 144.0 | 270.0 | 882.0 | GasA | TA | Y | SBrkr | 896 | 0 | 0 | 896 | 0.0 | 0.0 | 1 | 0 | 2 | 1 | TA | 5 | Typ | 0 | NaN | Attchd | 1961.0 | Unf | 1.0 | 730.0 | TA | TA | Y | 140 | 0 | 0 | 0 | 120 | 0 | NaN | MnPrv | NaN | 0 | 6 | 2010 | WD | Normal |
1 | 1462 | 20 | RL | 81.0 | 14267 | Pave | NaN | IR1 | Lvl | AllPub | Corner | Gtl | NAmes | Norm | Norm | 1Fam | 1Story | 6 | 6 | 1958 | 1958 | Hip | CompShg | Wd Sdng | Wd Sdng | BrkFace | 108.0 | TA | TA | CBlock | TA | TA | No | ALQ | 923.0 | Unf | 0.0 | 406.0 | 1329.0 | GasA | TA | Y | SBrkr | 1329 | 0 | 0 | 1329 | 0.0 | 0.0 | 1 | 1 | 3 | 1 | Gd | 6 | Typ | 0 | NaN | Attchd | 1958.0 | Unf | 1.0 | 312.0 | TA | TA | Y | 393 | 36 | 0 | 0 | 0 | 0 | NaN | NaN | Gar2 | 12500 | 6 | 2010 | WD | Normal |
2 | 1463 | 60 | RL | 74.0 | 13830 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 5 | 5 | 1997 | 1998 | Gable | CompShg | VinylSd | VinylSd | None | 0.0 | TA | TA | PConc | Gd | TA | No | GLQ | 791.0 | Unf | 0.0 | 137.0 | 928.0 | GasA | Gd | Y | SBrkr | 928 | 701 | 0 | 1629 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | TA | 6 | Typ | 1 | TA | Attchd | 1997.0 | Fin | 2.0 | 482.0 | TA | TA | Y | 212 | 34 | 0 | 0 | 0 | 0 | NaN | MnPrv | NaN | 0 | 3 | 2010 | WD | Normal |
3 | 1464 | 60 | RL | 78.0 | 9978 | Pave | NaN | IR1 | Lvl | AllPub | Inside | Gtl | Gilbert | Norm | Norm | 1Fam | 2Story | 6 | 6 | 1998 | 1998 | Gable | CompShg | VinylSd | VinylSd | BrkFace | 20.0 | TA | TA | PConc | TA | TA | No | GLQ | 602.0 | Unf | 0.0 | 324.0 | 926.0 | GasA | Ex | Y | SBrkr | 926 | 678 | 0 | 1604 | 0.0 | 0.0 | 2 | 1 | 3 | 1 | Gd | 7 | Typ | 1 | Gd | Attchd | 1998.0 | Fin | 2.0 | 470.0 | TA | TA | Y | 360 | 36 | 0 | 0 | 0 | 0 | NaN | NaN | NaN | 0 | 6 | 2010 | WD | Normal |
4 | 1465 | 120 | RL | 43.0 | 5005 | Pave | NaN | IR1 | HLS | AllPub | Inside | Gtl | StoneBr | Norm | Norm | TwnhsE | 1Story | 8 | 5 | 1992 | 1992 | Gable | CompShg | HdBoard | HdBoard | None | 0.0 | Gd | TA | PConc | Gd | TA | No | ALQ | 263.0 | Unf | 0.0 | 1017.0 | 1280.0 | GasA | Ex | Y | SBrkr | 1280 | 0 | 0 | 1280 | 0.0 | 0.0 | 2 | 0 | 2 | 1 | Gd | 5 | Typ | 0 | NaN | Attchd | 1992.0 | RFn | 2.0 | 506.0 | TA | TA | Y | 0 | 82 | 0 | 0 | 144 | 0 | NaN | NaN | NaN | 0 | 1 | 2010 | WD | Normal |
df_train.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1460 entries, 0 to 1459 Data columns (total 81 columns): Id 1460 non-null int64 MSSubClass 1460 non-null int64 MSZoning 1460 non-null object LotFrontage 1201 non-null float64 LotArea 1460 non-null int64 Street 1460 non-null object Alley 91 non-null object LotShape 1460 non-null object LandContour 1460 non-null object Utilities 1460 non-null object LotConfig 1460 non-null object LandSlope 1460 non-null object Neighborhood 1460 non-null object Condition1 1460 non-null object Condition2 1460 non-null object BldgType 1460 non-null object HouseStyle 1460 non-null object OverallQual 1460 non-null int64 OverallCond 1460 non-null int64 YearBuilt 1460 non-null int64 YearRemodAdd 1460 non-null int64 RoofStyle 1460 non-null object RoofMatl 1460 non-null object Exterior1st 1460 non-null object Exterior2nd 1460 non-null object MasVnrType 1452 non-null object MasVnrArea 1452 non-null float64 ExterQual 1460 non-null object ExterCond 1460 non-null object Foundation 1460 non-null object BsmtQual 1423 non-null object BsmtCond 1423 non-null object BsmtExposure 1422 non-null object BsmtFinType1 1423 non-null object BsmtFinSF1 1460 non-null int64 BsmtFinType2 1422 non-null object BsmtFinSF2 1460 non-null int64 BsmtUnfSF 1460 non-null int64 TotalBsmtSF 1460 non-null int64 Heating 1460 non-null object HeatingQC 1460 non-null object CentralAir 1460 non-null object Electrical 1459 non-null object 1stFlrSF 1460 non-null int64 2ndFlrSF 1460 non-null int64 LowQualFinSF 1460 non-null int64 GrLivArea 1460 non-null int64 BsmtFullBath 1460 non-null int64 BsmtHalfBath 1460 non-null int64 FullBath 1460 non-null int64 HalfBath 1460 non-null int64 BedroomAbvGr 1460 non-null int64 KitchenAbvGr 1460 non-null int64 KitchenQual 1460 non-null object TotRmsAbvGrd 1460 non-null int64 Functional 1460 non-null object Fireplaces 1460 non-null int64 FireplaceQu 770 non-null object GarageType 1379 non-null object GarageYrBlt 1379 non-null float64 GarageFinish 1379 non-null object GarageCars 1460 non-null int64 GarageArea 1460 non-null int64 GarageQual 1379 non-null object GarageCond 1379 non-null object PavedDrive 1460 non-null object WoodDeckSF 1460 non-null int64 OpenPorchSF 1460 non-null int64 EnclosedPorch 1460 non-null int64 3SsnPorch 1460 non-null int64 ScreenPorch 1460 non-null int64 PoolArea 1460 non-null int64 PoolQC 7 non-null object Fence 281 non-null object MiscFeature 54 non-null object MiscVal 1460 non-null int64 MoSold 1460 non-null int64 YrSold 1460 non-null int64 SaleType 1460 non-null object SaleCondition 1460 non-null object SalePrice 1460 non-null int64 dtypes: float64(3), int64(35), object(43) memory usage: 924.0+ KB
df = df_train.append(df_test, ignore_index = True)
/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py:7138: FutureWarning: Sorting because non-concatenation axis is not aligned. A future version of pandas will change to not sort by default. To accept the future behavior, pass 'sort=False'. To retain the current behavior and silence the warning, pass 'sort=True'.
print(df)
1stFlrSF 2ndFlrSF 3SsnPorch ... YearBuilt YearRemodAdd YrSold 0 856 854 0 ... 2003 2003 2008 1 1262 0 0 ... 1976 1976 2007 2 920 866 0 ... 2001 2002 2008 3 961 756 0 ... 1915 1970 2006 4 1145 1053 0 ... 2000 2000 2008 ... ... ... ... ... ... ... ... 2914 546 546 0 ... 1970 1970 2006 2915 546 546 0 ... 1970 1970 2006 2916 1224 0 0 ... 1960 1996 2006 2917 970 0 0 ... 1992 1992 2006 2918 996 1004 0 ... 1993 1994 2006 [2919 rows x 81 columns]
print(df.SalePrice)
0 208500.0 1 181500.0 2 223500.0 3 140000.0 4 250000.0 5 143000.0 6 307000.0 7 200000.0 8 129900.0 9 118000.0 10 129500.0 11 345000.0 12 144000.0 13 279500.0 14 157000.0 15 132000.0 16 149000.0 17 90000.0 18 159000.0 19 139000.0 20 325300.0 21 139400.0 22 230000.0 23 129900.0 24 154000.0 25 256300.0 26 134800.0 27 306000.0 28 207500.0 29 68500.0 ... 2889 NaN 2890 NaN 2891 NaN 2892 NaN 2893 NaN 2894 NaN 2895 NaN 2896 NaN 2897 NaN 2898 NaN 2899 NaN 2900 NaN 2901 NaN 2902 NaN 2903 NaN 2904 NaN 2905 NaN 2906 NaN 2907 NaN 2908 NaN 2909 NaN 2910 NaN 2911 NaN 2912 NaN 2913 NaN 2914 NaN 2915 NaN 2916 NaN 2917 NaN 2918 NaN Name: SalePrice, Length: 2919, dtype: float64
#Log-Transfrom SalesPrice. The final result will be predicted by Root Mean Square Error (RMSE) logarithm
df.SalePrice = np.log(df.SalePrice)
# Separate data to numerical ("quantitive") and categorical ("qualities")
quan = list(df_test.loc[:,df_test.dtypes != 'object'].drop('Id',axis=1).columns.values)
qual = list(df_test.loc[:,df_test.dtypes == 'object'].columns.values)
# Cleanse Data: Find out how many missing values in quan and qual
hasNAN = df[quan].isnull().sum()
hasNAN = hasNAN[hasNAN > 0]
hasNAN = hasNAN.sort_values(ascending=False)
print(hasNAN)
print('**'*42)
hasNAN = df[qual].isnull().sum()
hasNAN = hasNAN[hasNAN > 0]
hasNAN = hasNAN.sort_values(ascending=False)
print(hasNAN)
LotFrontage 486 GarageYrBlt 159 MasVnrArea 23 BsmtHalfBath 2 BsmtFullBath 2 GarageArea 1 GarageCars 1 TotalBsmtSF 1 BsmtUnfSF 1 BsmtFinSF2 1 BsmtFinSF1 1 dtype: int64 ************************************************************************************ PoolQC 2909 MiscFeature 2814 Alley 2721 Fence 2348 FireplaceQu 1420 GarageCond 159 GarageQual 159 GarageFinish 159 GarageType 157 BsmtExposure 82 BsmtCond 82 BsmtQual 81 BsmtFinType2 80 BsmtFinType1 79 MasVnrType 24 MSZoning 4 Functional 2 Utilities 2 Exterior1st 1 SaleType 1 Exterior2nd 1 KitchenQual 1 Electrical 1 dtype: int64
#Filling missing values for numeric data. Most of NAN are facilities/structures are not existed
df.LotFrontage.fillna(df.LotFrontage.median(), inplace=True)
df.GarageYrBlt.fillna(0, inplace=True)
df.MasVnrArea.fillna(0, inplace=True)
df.BsmtHalfBath.fillna(0, inplace=True)
df.BsmtFullBath.fillna(0, inplace=True)
df.GarageArea.fillna(0, inplace=True)
df.GarageCars.fillna(0, inplace=True)
df.TotalBsmtSF.fillna(0, inplace=True)
df.BsmtUnfSF.fillna(0, inplace=True)
df.BsmtFinSF2.fillna(0, inplace=True)
df.BsmtFinSF1.fillna(0, inplace=True)
# Filling missing values for categories features
df.PoolQC.fillna('NA', inplace=True)
df.MiscFeature.fillna('NA', inplace=True)
df.Alley.fillna('NA', inplace=True)
df.Fence.fillna('NA', inplace=True)
df.FireplaceQu.fillna('NA', inplace=True)
df.GarageCond.fillna('NA', inplace=True)
df.GarageQual.fillna('NA', inplace=True)
df.GarageFinish.fillna('NA', inplace=True)
df.GarageType.fillna('NA', inplace=True)
df.BsmtExposure.fillna('NA', inplace=True)
df.BsmtCond.fillna('NA', inplace=True)
df.BsmtQual.fillna('NA', inplace=True)
df.BsmtFinType2.fillna('NA', inplace=True)
df.BsmtFinType1.fillna('NA', inplace=True)
df.MasVnrType.fillna('None', inplace=True)
df.Exterior2nd.fillna('None', inplace=True)
# Filling mode for general properties that all houses should have
df.Functional.fillna(df.Functional.mode()[0], inplace = True)
df.Utilities.fillna(df.Utilities.mode()[0], inplace=True)
df.Exterior1st.fillna(df.Exterior1st.mode()[0], inplace=True)
df.SaleType.fillna(df.SaleType.mode()[0], inplace=True)
df.KitchenQual.fillna(df.KitchenQual.mode()[0], inplace=True)
df.Electrical.fillna(df.Electrical.mode()[0], inplace=True)
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2919 entries, 0 to 2918 Data columns (total 81 columns): 1stFlrSF 2919 non-null int64 2ndFlrSF 2919 non-null int64 3SsnPorch 2919 non-null int64 Alley 2919 non-null object BedroomAbvGr 2919 non-null int64 BldgType 2919 non-null object BsmtCond 2919 non-null object BsmtExposure 2919 non-null object BsmtFinSF1 2919 non-null float64 BsmtFinSF2 2919 non-null float64 BsmtFinType1 2919 non-null object BsmtFinType2 2919 non-null object BsmtFullBath 2919 non-null float64 BsmtHalfBath 2919 non-null float64 BsmtQual 2919 non-null object BsmtUnfSF 2919 non-null float64 CentralAir 2919 non-null object Condition1 2919 non-null object Condition2 2919 non-null object Electrical 2919 non-null object EnclosedPorch 2919 non-null int64 ExterCond 2919 non-null object ExterQual 2919 non-null object Exterior1st 2919 non-null object Exterior2nd 2919 non-null object Fence 2919 non-null object FireplaceQu 2919 non-null object Fireplaces 2919 non-null int64 Foundation 2919 non-null object FullBath 2919 non-null int64 Functional 2919 non-null object GarageArea 2919 non-null float64 GarageCars 2919 non-null float64 GarageCond 2919 non-null object GarageFinish 2919 non-null object GarageQual 2919 non-null object GarageType 2919 non-null object GarageYrBlt 2919 non-null float64 GrLivArea 2919 non-null int64 HalfBath 2919 non-null int64 Heating 2919 non-null object HeatingQC 2919 non-null object HouseStyle 2919 non-null object Id 2919 non-null int64 KitchenAbvGr 2919 non-null int64 KitchenQual 2919 non-null object LandContour 2919 non-null object LandSlope 2919 non-null object LotArea 2919 non-null int64 LotConfig 2919 non-null object LotFrontage 2919 non-null float64 LotShape 2919 non-null object LowQualFinSF 2919 non-null int64 MSSubClass 2919 non-null int64 MSZoning 2915 non-null object MasVnrArea 2919 non-null float64 MasVnrType 2919 non-null object MiscFeature 2919 non-null object MiscVal 2919 non-null int64 MoSold 2919 non-null int64 Neighborhood 2919 non-null object OpenPorchSF 2919 non-null int64 OverallCond 2919 non-null int64 OverallQual 2919 non-null int64 PavedDrive 2919 non-null object PoolArea 2919 non-null int64 PoolQC 2919 non-null object RoofMatl 2919 non-null object RoofStyle 2919 non-null object SaleCondition 2919 non-null object SalePrice 1460 non-null float64 SaleType 2919 non-null object ScreenPorch 2919 non-null int64 Street 2919 non-null object TotRmsAbvGrd 2919 non-null int64 TotalBsmtSF 2919 non-null float64 Utilities 2919 non-null object WoodDeckSF 2919 non-null int64 YearBuilt 2919 non-null int64 YearRemodAdd 2919 non-null int64 YrSold 2919 non-null int64 dtypes: float64(12), int64(26), object(43) memory usage: 1.8+ MB
set_option('precision', 1)
df.describe()
1stFlrSF | 2ndFlrSF | 3SsnPorch | BedroomAbvGr | BsmtFinSF1 | BsmtFinSF2 | BsmtFullBath | BsmtHalfBath | BsmtUnfSF | EnclosedPorch | Fireplaces | FullBath | GarageArea | GarageCars | GarageYrBlt | GrLivArea | HalfBath | Id | KitchenAbvGr | LotArea | LotFrontage | LowQualFinSF | MSSubClass | MasVnrArea | MiscVal | MoSold | OpenPorchSF | OverallCond | OverallQual | PoolArea | SalePrice | ScreenPorch | TotRmsAbvGrd | TotalBsmtSF | WoodDeckSF | YearBuilt | YearRemodAdd | YrSold | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
count | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2.9e+03 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 1460.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 | 2919.0 |
mean | 1159.6 | 336.5 | 2.6 | 2.9 | 441.3 | 49.6 | 0.4 | 6.1e-02 | 560.6 | 23.1 | 0.6 | 1.6 | 472.7 | 1.8 | 1870.4 | 1500.8 | 0.4 | 1460.0 | 1.0 | 10168.1 | 69.1 | 4.7 | 57.1 | 101.4 | 50.8 | 6.2 | 47.5 | 5.6 | 6.1 | 2.3 | 12.0 | 16.1 | 6.5 | 1051.4 | 93.7 | 1971.3 | 1984.3 | 2007.8 |
std | 392.4 | 428.7 | 25.2 | 0.8 | 455.6 | 169.2 | 0.5 | 2.5e-01 | 439.6 | 64.2 | 0.6 | 0.6 | 215.5 | 0.8 | 449.7 | 506.1 | 0.5 | 842.8 | 0.2 | 7887.0 | 21.3 | 46.4 | 42.5 | 178.9 | 567.4 | 2.7 | 67.6 | 1.1 | 1.4 | 35.7 | 0.4 | 56.2 | 1.6 | 441.1 | 126.5 | 30.3 | 20.9 | 1.3 |
min | 334.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0e+00 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 0.0 | 334.0 | 0.0 | 1.0 | 0.0 | 1300.0 | 21.0 | 0.0 | 20.0 | 0.0 | 0.0 | 1.0 | 0.0 | 1.0 | 1.0 | 0.0 | 10.5 | 0.0 | 2.0 | 0.0 | 0.0 | 1872.0 | 1950.0 | 2006.0 |
25% | 876.0 | 0.0 | 0.0 | 2.0 | 0.0 | 0.0 | 0.0 | 0.0e+00 | 220.0 | 0.0 | 0.0 | 1.0 | 320.0 | 1.0 | 1957.0 | 1126.0 | 0.0 | 730.5 | 1.0 | 7478.0 | 60.0 | 0.0 | 20.0 | 0.0 | 0.0 | 4.0 | 0.0 | 5.0 | 5.0 | 0.0 | 11.8 | 0.0 | 5.0 | 793.0 | 0.0 | 1953.5 | 1965.0 | 2007.0 |
50% | 1082.0 | 0.0 | 0.0 | 3.0 | 368.0 | 0.0 | 0.0 | 0.0e+00 | 467.0 | 0.0 | 1.0 | 2.0 | 480.0 | 2.0 | 1977.0 | 1444.0 | 0.0 | 1460.0 | 1.0 | 9453.0 | 68.0 | 0.0 | 50.0 | 0.0 | 0.0 | 6.0 | 26.0 | 5.0 | 6.0 | 0.0 | 12.0 | 0.0 | 6.0 | 989.0 | 0.0 | 1973.0 | 1993.0 | 2008.0 |
75% | 1387.5 | 704.0 | 0.0 | 3.0 | 733.0 | 0.0 | 1.0 | 0.0e+00 | 805.0 | 0.0 | 1.0 | 2.0 | 576.0 | 2.0 | 2001.0 | 1743.5 | 1.0 | 2189.5 | 1.0 | 11570.0 | 78.0 | 0.0 | 70.0 | 163.5 | 0.0 | 8.0 | 70.0 | 6.0 | 7.0 | 0.0 | 12.3 | 0.0 | 7.0 | 1302.0 | 168.0 | 2001.0 | 2004.0 | 2009.0 |
max | 5095.0 | 2065.0 | 508.0 | 8.0 | 5644.0 | 1526.0 | 3.0 | 2.0e+00 | 2336.0 | 1012.0 | 4.0 | 4.0 | 1488.0 | 5.0 | 2207.0 | 5642.0 | 2.0 | 2919.0 | 3.0 | 215245.0 | 313.0 | 1064.0 | 190.0 | 1600.0 | 17000.0 | 12.0 | 742.0 | 9.0 | 10.0 | 800.0 | 13.5 | 576.0 | 15.0 | 6110.0 | 1424.0 | 2010.0 | 2010.0 | 2010.0 |
#Correlation
set_option('precision', 2)
df.corr(method='pearson')
1stFlrSF | 2ndFlrSF | 3SsnPorch | BedroomAbvGr | BsmtFinSF1 | BsmtFinSF2 | BsmtFullBath | BsmtHalfBath | BsmtUnfSF | EnclosedPorch | Fireplaces | FullBath | GarageArea | GarageCars | GarageYrBlt | GrLivArea | HalfBath | Id | KitchenAbvGr | LotArea | LotFrontage | LowQualFinSF | MSSubClass | MasVnrArea | MiscVal | MoSold | OpenPorchSF | OverallCond | OverallQual | PoolArea | SalePrice | ScreenPorch | TotRmsAbvGrd | TotalBsmtSF | WoodDeckSF | YearBuilt | YearRemodAdd | YrSold | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1stFlrSF | 1.00e+00 | -2.50e-01 | 4.41e-02 | 1.08e-01 | 4.58e-01 | 8.44e-02 | 2.58e-01 | 1.05e-02 | 2.97e-01 | -6.58e-02 | 4.08e-01 | 3.73e-01 | 4.92e-01 | 0.44 | 1.48e-01 | 5.63e-01 | -1.04e-01 | -8.68e-03 | 7.61e-02 | 3.32e-01 | 4.23e-01 | -1.27e-02 | -2.49e-01 | 3.92e-01 | 9.31e-02 | 4.01e-02 | 2.39e-01 | -1.57e-01 | 4.79e-01 | 1.22e-01 | 5.97e-01 | 9.84e-02 | 0.39 | 0.80 | 2.27e-01 | 3.11e-01 | 2.42e-01 | -1.34e-02 |
2ndFlrSF | -2.50e-01 | 1.00e+00 | -3.25e-02 | 5.04e-01 | -1.62e-01 | -9.77e-02 | -1.61e-01 | -6.01e-02 | -3.24e-05 | 5.46e-02 | 1.70e-01 | 4.03e-01 | 1.27e-01 | 0.18 | 4.25e-02 | 6.55e-01 | 6.11e-01 | -2.23e-02 | 6.90e-02 | 3.15e-02 | 2.32e-02 | 1.78e-02 | 3.09e-01 | 1.19e-01 | -5.30e-03 | 1.42e-02 | 1.85e-01 | 5.49e-03 | 2.46e-01 | 4.45e-02 | 3.19e-01 | 1.11e-02 | 0.58 | -0.20 | 8.99e-02 | 1.76e-02 | 1.59e-01 | -1.92e-02 |
3SsnPorch | 4.41e-02 | -3.25e-02 | 1.00e+00 | -4.83e-02 | 5.09e-02 | -2.33e-02 | 2.74e-02 | 2.69e-02 | -5.76e-03 | -3.28e-02 | 1.89e-02 | 1.52e-02 | 2.95e-02 | 0.02 | 1.54e-02 | 6.27e-03 | -2.34e-02 | -4.65e-02 | -2.15e-02 | 1.60e-02 | 2.49e-02 | -4.54e-03 | -3.75e-02 | 1.41e-02 | -7.88e-04 | 2.74e-02 | -9.39e-03 | 4.37e-02 | 1.87e-02 | -6.53e-03 | 5.49e-02 | -2.95e-02 | -0.03 | 0.04 | -3.94e-03 | 1.60e-02 | 3.74e-02 | 2.26e-02 |
BedroomAbvGr | 1.08e-01 | 5.04e-01 | -4.83e-02 | 1.00e+00 | -1.13e-01 | -3.11e-02 | -1.55e-01 | 1.87e-02 | 1.84e-01 | 4.99e-02 | 8.68e-02 | 3.56e-01 | 7.37e-02 | 0.09 | -3.76e-02 | 5.17e-01 | 2.45e-01 | 3.07e-03 | 2.41e-01 | 1.33e-01 | 2.12e-01 | 7.04e-02 | -8.80e-03 | 7.82e-02 | 2.45e-04 | 5.60e-02 | 8.62e-02 | -8.48e-03 | 7.31e-02 | 3.65e-02 | 2.09e-01 | 7.25e-03 | 0.67 | 0.05 | 3.16e-02 | -5.31e-02 | -2.19e-02 | -2.05e-02 |
BsmtFinSF1 | 4.58e-01 | -1.62e-01 | 5.09e-02 | -1.13e-01 | 1.00e+00 | -5.49e-02 | 6.39e-01 | 7.85e-02 | -4.77e-01 | -9.96e-02 | 2.93e-01 | 8.18e-02 | 3.10e-01 | 0.26 | 1.25e-01 | 2.12e-01 | -7.06e-03 | -1.72e-02 | -8.63e-02 | 1.94e-01 | 2.01e-01 | -6.60e-02 | -6.40e-02 | 3.02e-01 | 9.33e-02 | -6.71e-04 | 1.24e-01 | -5.08e-02 | 2.82e-01 | 8.45e-02 | 3.72e-01 | 9.69e-02 | 0.05 | 0.54 | 2.24e-01 | 2.80e-01 | 1.53e-01 | 2.25e-02 |
BsmtFinSF2 | 8.44e-02 | -9.77e-02 | -2.33e-02 | -3.11e-02 | -5.49e-02 | 1.00e+00 | 1.63e-01 | 9.95e-02 | -2.38e-01 | 3.28e-02 | 6.57e-02 | -7.53e-02 | 3.45e-03 | -0.01 | 4.87e-02 | -1.77e-02 | -3.24e-02 | 1.82e-02 | -3.78e-02 | 8.41e-02 | 4.03e-02 | -4.91e-03 | -7.24e-02 | -1.46e-02 | -5.13e-03 | -9.51e-03 | -5.80e-03 | 4.14e-02 | -4.26e-02 | 4.45e-02 | 4.83e-03 | 6.33e-02 | -0.05 | 0.09 | 9.85e-02 | -2.75e-02 | -6.20e-02 | 8.87e-03 |
BsmtFullBath | 2.58e-01 | -1.61e-01 | 2.74e-02 | -1.55e-01 | 6.39e-01 | 1.63e-01 | 1.00e+00 | -1.49e-01 | -3.98e-01 | -6.82e-02 | 1.69e-01 | -1.92e-02 | 1.85e-01 | 0.16 | 6.35e-02 | 5.93e-02 | -3.36e-02 | -3.18e-04 | -1.78e-02 | 1.27e-01 | 1.01e-01 | -4.70e-02 | 1.04e-02 | 1.41e-01 | -4.58e-03 | -3.42e-03 | 7.98e-02 | -4.28e-02 | 1.65e-01 | 4.40e-02 | 2.36e-01 | 5.33e-02 | -0.04 | 0.33 | 1.86e-01 | 2.12e-01 | 1.35e-01 | 4.52e-02 |
BsmtHalfBath | 1.05e-02 | -6.01e-02 | 2.69e-02 | 1.87e-02 | 7.85e-02 | 9.95e-02 | -1.49e-01 | 1.00e+00 | -1.07e-01 | -9.61e-03 | 3.91e-02 | -4.71e-02 | -2.12e-02 | -0.03 | 1.50e-03 | -4.41e-02 | -5.85e-02 | 1.02e-02 | -6.49e-02 | 2.58e-02 | -2.39e-02 | -1.35e-02 | -1.73e-03 | 1.62e-02 | 3.69e-02 | 2.31e-02 | -3.51e-02 | 8.39e-02 | -4.05e-02 | 6.69e-02 | -5.15e-03 | 4.21e-02 | -0.05 | 0.01 | 5.17e-02 | -3.02e-02 | -4.62e-02 | -2.01e-02 |
BsmtUnfSF | 2.97e-01 | -3.24e-05 | -5.76e-03 | 1.84e-01 | -4.77e-01 | -2.38e-01 | -3.98e-01 | -1.07e-01 | 1.00e+00 | 5.16e-03 | 5.22e-03 | 2.74e-01 | 1.65e-01 | 0.18 | 1.42e-02 | 2.34e-01 | -3.55e-02 | -1.48e-02 | 6.51e-02 | 2.16e-02 | 1.06e-01 | 4.69e-02 | -1.26e-01 | 8.82e-02 | -1.05e-02 | 2.30e-02 | 1.20e-01 | -1.39e-01 | 2.76e-01 | -3.22e-02 | 2.22e-01 | -4.90e-02 | 0.25 | 0.41 | -3.90e-02 | 1.31e-01 | 1.66e-01 | -3.81e-02 |
EnclosedPorch | -6.58e-02 | 5.46e-02 | -3.28e-02 | 4.99e-02 | -9.96e-02 | 3.28e-02 | -6.82e-02 | -9.61e-03 | 5.16e-03 | 1.00e+00 | 9.71e-04 | -1.19e-01 | -1.08e-01 | -0.13 | -7.96e-02 | 3.27e-03 | -8.20e-02 | 2.16e-02 | 2.76e-02 | 2.10e-02 | 1.10e-02 | 8.72e-02 | -2.09e-02 | -1.11e-01 | 8.65e-03 | -2.10e-02 | -5.97e-02 | 7.10e-02 | -1.39e-01 | 9.25e-02 | -1.49e-01 | -6.44e-02 | 0.02 | -0.09 | -1.19e-01 | -3.74e-01 | -2.20e-01 | -1.13e-03 |
Fireplaces | 4.08e-01 | 1.70e-01 | 1.89e-02 | 8.68e-02 | 2.93e-01 | 6.57e-02 | 1.69e-01 | 3.91e-02 | 5.22e-03 | 9.71e-04 | 1.00e+00 | 2.36e-01 | 2.95e-01 | 0.32 | 1.87e-01 | 4.59e-01 | 2.06e-01 | -3.52e-02 | -1.08e-01 | 2.61e-01 | 2.32e-01 | -6.63e-03 | -5.52e-02 | 2.73e-01 | 8.53e-03 | 3.24e-02 | 1.59e-01 | -3.10e-02 | 3.91e-01 | 9.91e-02 | 4.89e-01 | 1.70e-01 | 0.31 | 0.33 | 2.28e-01 | 1.71e-01 | 1.34e-01 | -6.75e-03 |
FullBath | 3.73e-01 | 4.03e-01 | 1.52e-02 | 3.56e-01 | 8.18e-02 | -7.53e-02 | -1.92e-02 | -4.71e-02 | 2.74e-01 | -1.19e-01 | 2.36e-01 | 1.00e+00 | 4.08e-01 | 0.48 | 1.20e-01 | 6.30e-01 | 1.58e-01 | -9.95e-03 | 1.71e-01 | 1.26e-01 | 1.64e-01 | -2.88e-03 | 1.39e-01 | 2.54e-01 | -1.00e-02 | 4.63e-02 | 2.60e-01 | -2.16e-01 | 5.28e-01 | 2.81e-02 | 5.95e-01 | -1.59e-02 | 0.53 | 0.33 | 1.81e-01 | 4.71e-01 | 4.58e-01 | -4.88e-03 |
GarageArea | 4.92e-01 | 1.27e-01 | 2.95e-02 | 7.37e-02 | 3.10e-01 | 3.45e-03 | 1.85e-01 | -2.12e-02 | 1.65e-01 | -1.08e-01 | 2.95e-01 | 4.08e-01 | 1.00e+00 | 0.89 | 5.49e-01 | 4.85e-01 | 1.79e-01 | -9.85e-03 | -5.76e-02 | 2.13e-01 | 3.38e-01 | -5.34e-02 | -1.04e-01 | 3.71e-01 | 8.51e-03 | 4.12e-02 | 2.33e-01 | -1.54e-01 | 5.65e-01 | 5.31e-02 | 6.51e-01 | 6.26e-02 | 0.33 | 0.49 | 2.37e-01 | 4.81e-01 | 3.76e-01 | -1.30e-02 |
GarageCars | 4.41e-01 | 1.81e-01 | 2.34e-02 | 9.27e-02 | 2.55e-01 | -1.45e-02 | 1.62e-01 | -3.32e-02 | 1.81e-01 | -1.35e-01 | 3.22e-01 | 4.79e-01 | 8.90e-01 | 1.00 | 5.80e-01 | 4.89e-01 | 2.35e-01 | -1.12e-02 | -3.69e-02 | 1.80e-01 | 2.91e-01 | -6.72e-02 | -4.68e-02 | 3.58e-01 | -1.68e-02 | 5.18e-02 | 2.04e-01 | -1.82e-01 | 6.01e-01 | 3.04e-02 | 6.81e-01 | 4.33e-02 | 0.36 | 0.44 | 2.40e-01 | 5.39e-01 | 4.25e-01 | -2.24e-02 |
GarageYrBlt | 1.48e-01 | 4.25e-02 | 1.54e-02 | -3.76e-02 | 1.25e-01 | 4.87e-02 | 6.35e-02 | 1.50e-03 | 1.42e-02 | -7.96e-02 | 1.87e-01 | 1.20e-01 | 5.49e-01 | 0.58 | 1.00e+00 | 1.43e-01 | 9.75e-02 | 4.34e-04 | -1.87e-01 | 7.38e-02 | 9.93e-02 | -8.84e-02 | -1.05e-01 | 1.29e-01 | -5.77e-03 | 2.01e-02 | 5.03e-02 | 1.53e-02 | 2.78e-01 | 1.43e-02 | 3.49e-01 | 5.63e-02 | 0.06 | 0.16 | 1.24e-01 | 2.60e-01 | 1.47e-01 | -1.05e-02 |
GrLivArea | 5.63e-01 | 6.55e-01 | 6.27e-03 | 5.17e-01 | 2.12e-01 | -1.77e-02 | 5.93e-02 | -4.41e-02 | 2.34e-01 | 3.27e-03 | 4.59e-01 | 6.30e-01 | 4.85e-01 | 0.49 | 1.43e-01 | 1.00e+00 | 4.34e-01 | -2.90e-02 | 1.18e-01 | 2.85e-01 | 3.48e-01 | 9.69e-02 | 7.17e-02 | 4.00e-01 | 6.71e-02 | 4.42e-02 | 3.42e-01 | -1.17e-01 | 5.75e-01 | 1.35e-01 | 7.01e-01 | 8.63e-02 | 0.81 | 0.45 | 2.51e-01 | 2.43e-01 | 3.17e-01 | -2.69e-02 |
HalfBath | -1.04e-01 | 6.11e-01 | -2.34e-02 | 2.45e-01 | -7.06e-03 | -3.24e-02 | -3.36e-02 | -5.85e-02 | -3.55e-02 | -8.20e-02 | 2.06e-01 | 1.58e-01 | 1.79e-01 | 0.23 | 9.75e-02 | 4.34e-01 | 1.00e+00 | -1.54e-02 | -4.27e-02 | 3.42e-02 | 3.40e-02 | -3.95e-02 | 1.79e-01 | 1.88e-01 | 2.66e-02 | -1.14e-03 | 1.82e-01 | -8.86e-02 | 2.73e-01 | 1.42e-03 | 3.14e-01 | 3.56e-02 | 0.35 | -0.06 | 1.17e-01 | 2.70e-01 | 2.11e-01 | 1.59e-03 |
Id | -8.68e-03 | -2.23e-02 | -4.65e-02 | 3.07e-03 | -1.72e-02 | 1.82e-02 | -3.18e-04 | 1.02e-02 | -1.48e-02 | 2.16e-02 | -3.52e-02 | -9.95e-03 | -9.85e-03 | -0.01 | 4.34e-04 | -2.90e-02 | -1.54e-02 | 1.00e+00 | -1.17e-02 | -4.07e-02 | -2.47e-02 | -3.78e-02 | 8.93e-03 | -2.67e-02 | 8.24e-03 | 6.45e-03 | 9.96e-03 | -2.84e-03 | -2.98e-02 | 1.43e-02 | -1.79e-02 | 2.22e-02 | -0.03 | -0.03 | -7.06e-03 | -1.66e-02 | -5.04e-02 | -2.56e-01 |
KitchenAbvGr | 7.61e-02 | 6.90e-02 | -2.15e-02 | 2.41e-01 | -8.63e-02 | -3.78e-02 | -1.78e-02 | -6.49e-02 | 6.51e-02 | 2.76e-02 | -1.08e-01 | 1.71e-01 | -5.76e-02 | -0.04 | -1.87e-01 | 1.18e-01 | -4.27e-02 | -1.17e-02 | 1.00e+00 | -2.09e-02 | 4.91e-03 | 4.38e-04 | 2.60e-01 | -5.11e-02 | 2.51e-02 | 3.55e-02 | -6.82e-02 | -8.67e-02 | -1.59e-01 | -1.31e-02 | -1.48e-01 | -5.66e-02 | 0.29 | -0.04 | -8.74e-02 | -1.38e-01 | -1.42e-01 | 3.52e-02 |
LotArea | 3.32e-01 | 3.15e-02 | 1.60e-02 | 1.33e-01 | 1.94e-01 | 8.41e-02 | 1.27e-01 | 2.58e-02 | 2.16e-02 | 2.10e-02 | 2.61e-01 | 1.26e-01 | 2.13e-01 | 0.18 | 7.38e-02 | 2.85e-01 | 3.42e-02 | -4.07e-02 | -2.09e-02 | 1.00e+00 | 3.61e-01 | 5.54e-04 | -2.02e-01 | 1.25e-01 | 6.90e-02 | 4.16e-03 | 1.05e-01 | -3.56e-02 | 1.01e-01 | 9.37e-02 | 2.57e-01 | 5.44e-02 | 0.21 | 0.25 | 1.58e-01 | 2.41e-02 | 2.16e-02 | -2.42e-02 |
LotFrontage | 4.23e-01 | 2.32e-02 | 2.49e-02 | 2.12e-01 | 2.01e-01 | 4.03e-02 | 1.01e-01 | -2.39e-02 | 1.06e-01 | 1.10e-02 | 2.32e-01 | 1.64e-01 | 3.38e-01 | 0.29 | 9.93e-02 | 3.48e-01 | 3.40e-02 | -2.47e-02 | 4.91e-03 | 3.61e-01 | 1.00e+00 | 5.14e-03 | -3.89e-01 | 2.00e-01 | 3.52e-02 | 1.07e-02 | 1.52e-01 | -6.81e-02 | 2.04e-01 | 1.61e-01 | 3.34e-01 | 6.94e-02 | 0.32 | 0.33 | 1.04e-01 | 1.15e-01 | 8.60e-02 | -7.56e-03 |
LowQualFinSF | -1.27e-02 | 1.78e-02 | -4.54e-03 | 7.04e-02 | -6.60e-02 | -4.91e-03 | -4.70e-02 | -1.35e-02 | 4.69e-02 | 8.72e-02 | -6.63e-03 | -2.88e-03 | -5.34e-02 | -0.07 | -8.84e-02 | 9.69e-02 | -3.95e-02 | -3.78e-02 | 4.38e-04 | 5.54e-04 | 5.14e-03 | 1.00e+00 | 2.65e-02 | -5.74e-02 | -5.97e-03 | 1.15e-02 | -6.92e-04 | 9.05e-03 | -4.84e-02 | 3.52e-02 | -3.80e-02 | 6.84e-03 | 0.10 | -0.02 | -1.56e-02 | -1.44e-01 | -6.04e-02 | -2.26e-03 |
MSSubClass | -2.49e-01 | 3.09e-01 | -3.75e-02 | -8.80e-03 | -6.40e-02 | -7.24e-02 | 1.04e-02 | -1.73e-03 | -1.26e-01 | -2.09e-02 | -5.52e-02 | 1.39e-01 | -1.04e-01 | -0.05 | -1.05e-01 | 7.17e-02 | 1.79e-01 | 8.93e-03 | 2.60e-01 | -2.02e-01 | -3.89e-01 | 2.65e-02 | 1.00e+00 | 6.31e-03 | -2.89e-02 | -1.23e-03 | -1.59e-02 | -6.56e-02 | 3.36e-02 | -3.08e-03 | -7.40e-02 | -4.92e-02 | 0.04 | -0.22 | -1.77e-02 | 3.44e-02 | 4.33e-02 | -1.50e-02 |
MasVnrArea | 3.92e-01 | 1.19e-01 | 1.41e-02 | 7.82e-02 | 3.02e-01 | -1.46e-02 | 1.41e-01 | 1.62e-02 | 8.82e-02 | -1.11e-01 | 2.73e-01 | 2.54e-01 | 3.71e-01 | 0.36 | 1.29e-01 | 4.00e-01 | 1.88e-01 | -2.67e-02 | -5.11e-02 | 1.25e-01 | 2.00e-01 | -5.74e-02 | 6.31e-03 | 1.00e+00 | 4.52e-02 | -2.22e-03 | 1.41e-01 | -1.33e-01 | 4.27e-01 | 4.79e-03 | 4.27e-01 | 6.64e-02 | 0.28 | 0.39 | 1.67e-01 | 3.08e-01 | 1.92e-01 | -1.77e-02 |
MiscVal | 9.31e-02 | -5.30e-03 | -7.88e-04 | 2.45e-04 | 9.33e-02 | -5.13e-03 | -4.58e-03 | 3.69e-02 | -1.05e-02 | 8.65e-03 | 8.53e-03 | -1.00e-02 | 8.51e-03 | -0.02 | -5.77e-03 | 6.71e-02 | 2.66e-02 | 8.24e-03 | 2.51e-02 | 6.90e-02 | 3.52e-02 | -5.97e-03 | -2.89e-02 | 4.52e-02 | 1.00e+00 | 7.44e-03 | 7.74e-02 | 3.40e-02 | 5.56e-03 | 1.19e-02 | -2.00e-02 | 7.07e-03 | 0.06 | 0.08 | 5.69e-02 | -1.09e-02 | -3.12e-03 | 8.44e-03 |
MoSold | 4.01e-02 | 1.42e-02 | 2.74e-02 | 5.60e-02 | -6.71e-04 | -9.51e-03 | -3.42e-03 | 2.31e-02 | 2.30e-02 | -2.10e-02 | 3.24e-02 | 4.63e-02 | 4.12e-02 | 0.05 | 2.01e-02 | 4.42e-02 | -1.14e-03 | 6.45e-03 | 3.55e-02 | 4.16e-03 | 1.07e-02 | 1.15e-02 | -1.23e-03 | -2.22e-03 | 7.44e-03 | 1.00e+00 | 3.42e-02 | -6.26e-03 | 3.04e-02 | -4.22e-02 | 5.73e-02 | 2.85e-02 | 0.05 | 0.02 | 1.81e-02 | 1.39e-02 | 1.77e-02 | -1.54e-01 |
OpenPorchSF | 2.39e-01 | 1.85e-01 | -9.39e-03 | 8.62e-02 | 1.24e-01 | -5.80e-03 | 7.98e-02 | -3.51e-02 | 1.20e-01 | -5.97e-02 | 1.59e-01 | 2.60e-01 | 2.33e-01 | 0.20 | 5.03e-02 | 3.42e-01 | 1.82e-01 | 9.96e-03 | -6.82e-02 | 1.05e-01 | 1.52e-01 | -6.92e-04 | -1.59e-02 | 1.41e-01 | 7.74e-02 | 3.42e-02 | 1.00e+00 | -6.90e-02 | 2.98e-01 | 6.42e-02 | 3.21e-01 | 4.78e-02 | 0.24 | 0.25 | 3.85e-02 | 1.99e-01 | 2.42e-01 | -3.74e-02 |
OverallCond | -1.57e-01 | 5.49e-03 | 4.37e-02 | -8.48e-03 | -5.08e-02 | 4.14e-02 | -4.28e-02 | 8.39e-02 | -1.39e-01 | 7.10e-02 | -3.10e-02 | -2.16e-01 | -1.54e-01 | -0.18 | 1.53e-02 | -1.17e-01 | -8.86e-02 | -2.84e-03 | -8.67e-02 | -3.56e-02 | -6.81e-02 | 9.05e-03 | -6.56e-02 | -1.33e-01 | 3.40e-02 | -6.26e-03 | -6.90e-02 | 1.00e+00 | -9.38e-02 | -1.69e-02 | -3.69e-02 | 4.37e-02 | -0.09 | -0.17 | 2.01e-02 | -3.68e-01 | 4.77e-02 | 3.01e-02 |
OverallQual | 4.79e-01 | 2.46e-01 | 1.87e-02 | 7.31e-02 | 2.82e-01 | -4.26e-02 | 1.65e-01 | -4.05e-02 | 2.76e-01 | -1.39e-01 | 3.91e-01 | 5.28e-01 | 5.65e-01 | 0.60 | 2.78e-01 | 5.75e-01 | 2.73e-01 | -2.98e-02 | -1.59e-01 | 1.01e-01 | 2.04e-01 | -4.84e-02 | 3.36e-02 | 4.27e-01 | 5.56e-03 | 3.04e-02 | 2.98e-01 | -9.38e-02 | 1.00e+00 | 3.07e-02 | 8.17e-01 | 4.29e-02 | 0.39 | 0.55 | 2.55e-01 | 5.98e-01 | 5.72e-01 | -1.96e-02 |
PoolArea | 1.22e-01 | 4.45e-02 | -6.53e-03 | 3.65e-02 | 8.45e-02 | 4.45e-02 | 4.40e-02 | 6.69e-02 | -3.22e-02 | 9.25e-02 | 9.91e-02 | 2.81e-02 | 5.31e-02 | 0.03 | 1.43e-02 | 1.35e-01 | 1.42e-03 | 1.43e-02 | -1.31e-02 | 9.37e-02 | 1.61e-01 | 3.52e-02 | -3.08e-03 | 4.79e-03 | 1.19e-02 | -4.22e-02 | 6.42e-02 | -1.69e-02 | 3.07e-02 | 1.00e+00 | 6.98e-02 | 2.63e-02 | 0.07 | 0.07 | 9.42e-02 | 2.30e-03 | -1.14e-02 | -5.28e-02 |
SalePrice | 5.97e-01 | 3.19e-01 | 5.49e-02 | 2.09e-01 | 3.72e-01 | 4.83e-03 | 2.36e-01 | -5.15e-03 | 2.22e-01 | -1.49e-01 | 4.89e-01 | 5.95e-01 | 6.51e-01 | 0.68 | 3.49e-01 | 7.01e-01 | 3.14e-01 | -1.79e-02 | -1.48e-01 | 2.57e-01 | 3.34e-01 | -3.80e-02 | -7.40e-02 | 4.27e-01 | -2.00e-02 | 5.73e-02 | 3.21e-01 | -3.69e-02 | 8.17e-01 | 6.98e-02 | 1.00e+00 | 1.21e-01 | 0.53 | 0.61 | 3.34e-01 | 5.87e-01 | 5.66e-01 | -3.73e-02 |
ScreenPorch | 9.84e-02 | 1.11e-02 | -2.95e-02 | 7.25e-03 | 9.69e-02 | 6.33e-02 | 5.33e-02 | 4.21e-02 | -4.90e-02 | -6.44e-02 | 1.70e-01 | -1.59e-02 | 6.26e-02 | 0.04 | 5.63e-02 | 8.63e-02 | 3.56e-02 | 2.22e-02 | -5.66e-02 | 5.44e-02 | 6.94e-02 | 6.84e-03 | -4.92e-02 | 6.64e-02 | 7.07e-03 | 2.85e-02 | 4.78e-02 | 4.37e-02 | 4.29e-02 | 2.63e-02 | 1.21e-01 | 1.00e+00 | 0.03 | 0.08 | -5.21e-02 | -4.10e-02 | -4.69e-02 | -6.63e-03 |
TotRmsAbvGrd | 3.92e-01 | 5.85e-01 | -2.58e-02 | 6.70e-01 | 5.26e-02 | -4.82e-02 | -3.92e-02 | -4.96e-02 | 2.48e-01 | 1.53e-02 | 3.11e-01 | 5.27e-01 | 3.29e-01 | 0.36 | 5.93e-02 | 8.08e-01 | 3.46e-01 | -2.94e-02 | 2.95e-01 | 2.14e-01 | 3.21e-01 | 1.02e-01 | 4.05e-02 | 2.77e-01 | 6.09e-02 | 4.51e-02 | 2.38e-01 | -9.20e-02 | 3.90e-01 | 7.21e-02 | 5.34e-01 | 3.23e-02 | 1.00 | 0.28 | 1.57e-01 | 1.14e-01 | 1.98e-01 | -3.24e-02 |
TotalBsmtSF | 8.01e-01 | -2.05e-01 | 3.79e-02 | 5.42e-02 | 5.37e-01 | 8.96e-02 | 3.26e-01 | 1.28e-02 | 4.13e-01 | -8.51e-02 | 3.33e-01 | 3.28e-01 | 4.86e-01 | 0.44 | 1.62e-01 | 4.46e-01 | -5.50e-02 | -2.55e-02 | -3.88e-02 | 2.54e-01 | 3.29e-01 | -2.33e-02 | -2.19e-01 | 3.94e-01 | 8.40e-02 | 1.85e-02 | 2.46e-01 | -1.75e-01 | 5.50e-01 | 7.22e-02 | 6.12e-01 | 7.55e-02 | 0.28 | 1.00 | 2.30e-01 | 4.09e-01 | 2.99e-01 | -1.13e-02 |
WoodDeckSF | 2.27e-01 | 8.99e-02 | -3.94e-03 | 3.16e-02 | 2.24e-01 | 9.85e-02 | 1.86e-01 | 5.17e-02 | -3.90e-02 | -1.19e-01 | 2.28e-01 | 1.81e-01 | 2.37e-01 | 0.24 | 1.24e-01 | 2.51e-01 | 1.17e-01 | -7.06e-03 | -8.74e-02 | 1.58e-01 | 1.04e-01 | -1.56e-02 | -1.77e-02 | 1.67e-01 | 5.69e-02 | 1.81e-02 | 3.85e-02 | 2.01e-02 | 2.55e-01 | 9.42e-02 | 3.34e-01 | -5.21e-02 | 0.16 | 0.23 | 1.00e+00 | 2.29e-01 | 2.19e-01 | -1.80e-04 |
YearBuilt | 3.11e-01 | 1.76e-02 | 1.60e-02 | -5.31e-02 | 2.80e-01 | -2.75e-02 | 2.12e-01 | -3.02e-02 | 1.31e-01 | -3.74e-01 | 1.71e-01 | 4.71e-01 | 4.81e-01 | 0.54 | 2.60e-01 | 2.43e-01 | 2.70e-01 | -1.66e-02 | -1.38e-01 | 2.41e-02 | 1.15e-01 | -1.44e-01 | 3.44e-02 | 3.08e-01 | -1.09e-02 | 1.39e-02 | 1.99e-01 | -3.68e-01 | 5.98e-01 | 2.30e-03 | 5.87e-01 | -4.10e-02 | 0.11 | 0.41 | 2.29e-01 | 1.00e+00 | 6.12e-01 | -1.23e-02 |
YearRemodAdd | 2.42e-01 | 1.59e-01 | 3.74e-02 | -2.19e-02 | 1.53e-01 | -6.20e-02 | 1.35e-01 | -4.62e-02 | 1.66e-01 | -2.20e-01 | 1.34e-01 | 4.58e-01 | 3.76e-01 | 0.43 | 1.47e-01 | 3.17e-01 | 2.11e-01 | -5.04e-02 | -1.42e-01 | 2.16e-02 | 8.60e-02 | -6.04e-02 | 4.33e-02 | 1.92e-01 | -3.12e-03 | 1.77e-02 | 2.42e-01 | 4.77e-02 | 5.72e-01 | -1.14e-02 | 5.66e-01 | -4.69e-02 | 0.20 | 0.30 | 2.19e-01 | 6.12e-01 | 1.00e+00 | 3.32e-02 |
YrSold | -1.34e-02 | -1.92e-02 | 2.26e-02 | -2.05e-02 | 2.25e-02 | 8.87e-03 | 4.52e-02 | -2.01e-02 | -3.81e-02 | -1.13e-03 | -6.75e-03 | -4.88e-03 | -1.30e-02 | -0.02 | -1.05e-02 | -2.69e-02 | 1.59e-03 | -2.56e-01 | 3.52e-02 | -2.42e-02 | -7.56e-03 | -2.26e-03 | -1.50e-02 | -1.77e-02 | 8.44e-03 | -1.54e-01 | -3.74e-02 | 3.01e-02 | -1.96e-02 | -5.28e-02 | -3.73e-02 | -6.63e-03 | -0.03 | -0.01 | -1.80e-04 | -1.23e-02 | 3.32e-02 | 1.00e+00 |
df_train.describe(include=['number']).loc[['min','max','mean']].T.sort_values('max')
min | max | mean | |
---|---|---|---|
BsmtHalfBath | 0.0 | 2.0 | 0.06 |
HalfBath | 0.0 | 2.0 | 0.38 |
FullBath | 0.0 | 3.0 | 1.57 |
KitchenAbvGr | 0.0 | 3.0 | 1.05 |
BsmtFullBath | 0.0 | 3.0 | 0.43 |
Fireplaces | 0.0 | 3.0 | 0.61 |
GarageCars | 0.0 | 4.0 | 1.77 |
BedroomAbvGr | 0.0 | 8.0 | 2.87 |
OverallCond | 1.0 | 9.0 | 5.58 |
OverallQual | 1.0 | 10.0 | 6.10 |
MoSold | 1.0 | 12.0 | 6.32 |
TotRmsAbvGrd | 2.0 | 14.0 | 6.52 |
MSSubClass | 20.0 | 190.0 | 56.90 |
LotFrontage | 21.0 | 313.0 | 70.05 |
ScreenPorch | 0.0 | 480.0 | 15.06 |
3SsnPorch | 0.0 | 508.0 | 3.41 |
OpenPorchSF | 0.0 | 547.0 | 46.66 |
EnclosedPorch | 0.0 | 552.0 | 21.95 |
LowQualFinSF | 0.0 | 572.0 | 5.84 |
PoolArea | 0.0 | 738.0 | 2.76 |
WoodDeckSF | 0.0 | 857.0 | 94.24 |
GarageArea | 0.0 | 1418.0 | 472.98 |
Id | 1.0 | 1460.0 | 730.50 |
BsmtFinSF2 | 0.0 | 1474.0 | 46.55 |
MasVnrArea | 0.0 | 1600.0 | 103.69 |
GarageYrBlt | 1900.0 | 2010.0 | 1978.51 |
YearRemodAdd | 1950.0 | 2010.0 | 1984.87 |
YearBuilt | 1872.0 | 2010.0 | 1971.27 |
YrSold | 2006.0 | 2010.0 | 2007.82 |
2ndFlrSF | 0.0 | 2065.0 | 346.99 |
BsmtUnfSF | 0.0 | 2336.0 | 567.24 |
1stFlrSF | 334.0 | 4692.0 | 1162.63 |
GrLivArea | 334.0 | 5642.0 | 1515.46 |
BsmtFinSF1 | 0.0 | 5644.0 | 443.64 |
TotalBsmtSF | 0.0 | 6110.0 | 1057.43 |
MiscVal | 0.0 | 15500.0 | 43.49 |
LotArea | 1300.0 | 215245.0 | 10516.83 |
SalePrice | 34900.0 | 755000.0 | 180921.20 |
#plotting
plt.figure(figsize=(8,3))
sns.distplot(df_train['SalePrice'], fit=norm)
mu, sigma = norm.fit(df_train['SalePrice'])
plt.legend(['Normal dist. ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)], loc='best')
plt.ylabel('Frequency')
print('skew={}'.format(skew(np.log1p(df_train['SalePrice']))))
skew=0.12122191311528363
The plot above shows very clearly that sale price has a right skewed distribution (positive skew). Normal distributions are very commonly used to normalize skewed data. This could be done because normal distribution are neccessary for a variety of statiscal analyzes
cols = ['OverallQual','OverallCond','SaleType','SaleCondition']
sorted_data = df_train.sort_values(by='SalePrice')
fig, axes = plt.subplots(ncols=4, nrows=1, figsize=(4 * 4, 3), sharey=True)
for i, c in zip(np.arange(len(axes)), cols):
sns.boxplot(x=c, y='SalePrice', data=sorted_data, ax=axes[i])
fig, axes = plt.subplots(ncols=4, nrows=1, figsize=(4 * 4, 3), sharey=True)
for i, c in zip(np.arange(len(axes)), cols):
sns.countplot(x=c, data=sorted_data, ax=axes[i])
n = df_train.select_dtypes(include=object)
for c in n.columns:
print('{:<14}'.format(c), df_train[c].unique())
MSZoning ['RL' 'RM' 'C (all)' 'FV' 'RH'] Street ['Pave' 'Grvl'] Alley [nan 'Grvl' 'Pave'] LotShape ['Reg' 'IR1' 'IR2' 'IR3'] LandContour ['Lvl' 'Bnk' 'Low' 'HLS'] Utilities ['AllPub' 'NoSeWa'] LotConfig ['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3'] LandSlope ['Gtl' 'Mod' 'Sev'] Neighborhood ['CollgCr' 'Veenker' 'Crawfor' 'NoRidge' 'Mitchel' 'Somerst' 'NWAmes' 'OldTown' 'BrkSide' 'Sawyer' 'NridgHt' 'NAmes' 'SawyerW' 'IDOTRR' 'MeadowV' 'Edwards' 'Timber' 'Gilbert' 'StoneBr' 'ClearCr' 'NPkVill' 'Blmngtn' 'BrDale' 'SWISU' 'Blueste'] Condition1 ['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe'] Condition2 ['Norm' 'Artery' 'RRNn' 'Feedr' 'PosN' 'PosA' 'RRAn' 'RRAe'] BldgType ['1Fam' '2fmCon' 'Duplex' 'TwnhsE' 'Twnhs'] HouseStyle ['2Story' '1Story' '1.5Fin' '1.5Unf' 'SFoyer' 'SLvl' '2.5Unf' '2.5Fin'] RoofStyle ['Gable' 'Hip' 'Gambrel' 'Mansard' 'Flat' 'Shed'] RoofMatl ['CompShg' 'WdShngl' 'Metal' 'WdShake' 'Membran' 'Tar&Grv' 'Roll' 'ClyTile'] Exterior1st ['VinylSd' 'MetalSd' 'Wd Sdng' 'HdBoard' 'BrkFace' 'WdShing' 'CemntBd' 'Plywood' 'AsbShng' 'Stucco' 'BrkComm' 'AsphShn' 'Stone' 'ImStucc' 'CBlock'] Exterior2nd ['VinylSd' 'MetalSd' 'Wd Shng' 'HdBoard' 'Plywood' 'Wd Sdng' 'CmentBd' 'BrkFace' 'Stucco' 'AsbShng' 'Brk Cmn' 'ImStucc' 'AsphShn' 'Stone' 'Other' 'CBlock'] MasVnrType ['BrkFace' 'None' 'Stone' 'BrkCmn' nan] ExterQual ['Gd' 'TA' 'Ex' 'Fa'] ExterCond ['TA' 'Gd' 'Fa' 'Po' 'Ex'] Foundation ['PConc' 'CBlock' 'BrkTil' 'Wood' 'Slab' 'Stone'] BsmtQual ['Gd' 'TA' 'Ex' nan 'Fa'] BsmtCond ['TA' 'Gd' nan 'Fa' 'Po'] BsmtExposure ['No' 'Gd' 'Mn' 'Av' nan] BsmtFinType1 ['GLQ' 'ALQ' 'Unf' 'Rec' 'BLQ' nan 'LwQ'] BsmtFinType2 ['Unf' 'BLQ' nan 'ALQ' 'Rec' 'LwQ' 'GLQ'] Heating ['GasA' 'GasW' 'Grav' 'Wall' 'OthW' 'Floor'] HeatingQC ['Ex' 'Gd' 'TA' 'Fa' 'Po'] CentralAir ['Y' 'N'] Electrical ['SBrkr' 'FuseF' 'FuseA' 'FuseP' 'Mix' nan] KitchenQual ['Gd' 'TA' 'Ex' 'Fa'] Functional ['Typ' 'Min1' 'Maj1' 'Min2' 'Mod' 'Maj2' 'Sev'] FireplaceQu [nan 'TA' 'Gd' 'Fa' 'Ex' 'Po'] GarageType ['Attchd' 'Detchd' 'BuiltIn' 'CarPort' nan 'Basment' '2Types'] GarageFinish ['RFn' 'Unf' 'Fin' nan] GarageQual ['TA' 'Fa' 'Gd' nan 'Ex' 'Po'] GarageCond ['TA' 'Fa' nan 'Gd' 'Po' 'Ex'] PavedDrive ['Y' 'N' 'P'] PoolQC [nan 'Ex' 'Fa' 'Gd'] Fence [nan 'MnPrv' 'GdWo' 'GdPrv' 'MnWw'] MiscFeature [nan 'Shed' 'Gar2' 'Othr' 'TenC'] SaleType ['WD' 'New' 'COD' 'ConLD' 'ConLI' 'CWD' 'ConLw' 'Con' 'Oth'] SaleCondition ['Normal' 'Abnorml' 'Partial' 'AdjLand' 'Alloca' 'Family']
Exploring the relationship between Sale Price and Ground Living Area
fig, axes = plt.subplots(ncols=5, nrows=2, figsize=(16, 4), sharey=True)
axes = np.ravel(axes)
col_name = ['GrLivArea','TotalBsmtSF','1stFlrSF','BsmtFinSF1','LotArea']
for i, c in zip(range(5), col_name):
df_train.plot.scatter(ax=axes[i], x=c, y='SalePrice', sharey=True, colorbar=False, c='red')
# delete outliers
print(df_train.shape)
df_train = df_train[df_train['GrLivArea'] < 4000]
df_train = df_train[df_train['LotArea'] < 100000]
print(df_train.shape)
for i, c in zip(range(5,10), col_name):
df_train.plot.scatter(ax=axes[i], x=c, y='SalePrice', sharey=True, colorbar=False, c='navy')
(1460, 81) (1452, 81)
#Data Visualization
# histograms
df.hist(bins=10,figsize=(10,9),grid=False);
#Distribution of Sale Price variable
sns.set(style='whitegrid', palette="deep", font_scale=1.1, rc={"figure.figsize": [8, 5]})
sns.distplot(
df_train['SalePrice'], norm_hist=False, kde=False, bins=20, hist_kws={"alpha": 1}
).set(xlabel='Sale Price', ylabel='Count');
Lab 4: Hypothesis Testing
We'll use the ANOVA tesst to understand the correlation between categorical variables and SalePrice. ANOVA test is a statistical technique used to determine if there exists a significant difference in the mean of groups. Ho - There exists no significant difference between the groups. Ha - There exists a significant difference between the groups.
##separate variables into new data frames
numeric_data = df_train.select_dtypes(include=[np.number])
cat_data = df_train.select_dtypes(exclude=[np.number])
from scipy import stats
from scipy.stats import norm
cat = [f for f in df_train.columns if df_train.dtypes[f] == 'object']
def anova(frame):
anv = pd.DataFrame()
anv['features'] = cat
pvals = []
for c in cat:
samples = []
for cls in frame[c].unique():
s = frame[frame[c] == cls]['SalePrice'].values
samples.append(s)
pval = stats.f_oneway(*samples)[1]
pvals.append(pval)
anv['pval'] = pvals
return anv.sort_values('pval')
cat_data['SalePrice'] = df_train.SalePrice.values
k = anova(cat_data)
k['disparity'] = np.log(1./k['pval'].values)
sns.barplot(data=k, x = 'features', y='disparity')
plt.xticks(rotation=90)
/usr/local/lib/python3.6/dist-packages/ipykernel_launcher.py:18: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy /usr/local/lib/python3.6/dist-packages/scipy/stats/stats.py:3225: RuntimeWarning: invalid value encountered in double_scalars
(array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]), <a list of 43 Text xticklabel objects>)