"mpg","cylinders","displacement","horsepower","weight","acceleration","year","origin" 14.5,8,351,152,4215,12.8,76,1 25.5,4,140,89,2755,15.8,77,1 22.5,6,232,90,3085,17.6,76,1 13,8,307,130,4098,14,72,1...

1 answer below »
Only 2 questions need to be answered, in a Python Jupyter notebook!


"mpg","cylinders","displacement","horsepower","weight","acceleration","year","origin" 14.5,8,351,152,4215,12.8,76,1 25.5,4,140,89,2755,15.8,77,1 22.5,6,232,90,3085,17.6,76,1 13,8,307,130,4098,14,72,1 27.9,4,156,105,2800,14.4,80,1 18.6,6,225,110,3620,18.7,78,1 33.5,4,151,90,2556,13.2,79,1 12,8,383,180,4955,11.5,71,1 29.5,4,98,68,2135,16.6,78,3 14,8,351,148,4657,13.5,75,1 24.3,4,151,90,3003,20.1,80,1 36.1,4,91,60,1800,16.4,78,3 27.2,4,141,71,3190,24.8,79,2 16.5,6,168,120,3820,16.7,76,2 17.5,6,258,95,3193,17.8,76,1 38,4,91,67,1995,16.2,82,3 14,8,455,225,4425,10,70,1 25,4,121,115,2671,13.5,75,2 17,6,163,125,3140,13.6,78,2 27,4,151,90,2950,17.3,82,1 22,4,121,98,2945,14.5,75,2 32.2,4,108,75,2265,15.2,80,3 21,6,231,110,3039,15,75,1 13,8,400,150,4464,12,73,1 10,8,307,200,4376,15,70,1 18,6,232,100,3288,15.5,71,1 16,8,400,230,4278,9.5,73,1 22,6,225,100,3233,15.4,76,1 21.5,4,121,110,2600,12.8,77,2 24,4,120,97,2489,15,74,3 32.7,6,168,132,2910,11.4,80,3 12,8,350,180,4499,12.5,73,1 29,4,97,75,2171,16,75,3 19,4,121,112,2868,15.5,73,2 26,4,97,75,2265,18.2,77,3 14,8,318,150,4237,14.5,73,1 25,4,90,71,2223,16.5,75,2 20.3,5,131,103,2830,15.9,78,2 27,4,97,88,2130,14.5,71,3 38.1,4,89,60,1968,18.8,80,3 22.4,6,231,110,3415,15.8,81,1 27.4,4,121,80,2670,15,79,1 24,4,121,110,2660,14,73,2 29,4,68,49,1867,19.5,73,2 26.6,8,350,105,3725,19,81,1 32.1,4,98,70,2120,15.5,80,1 34.2,4,105,70,2200,13.2,79,1 16,8,302,140,4141,14,74,1 30.9,4,105,75,2230,14.5,78,1 29.8,4,89,62,1845,15.3,80,2 26,4,108,93,2391,15.5,74,3 43.1,4,90,48,1985,21.5,78,2 13,8,350,150,4699,14.5,74,1 18,3,70,90,2124,13.5,73,3 19.4,6,232,90,3210,17.2,78,1 12,8,350,160,4456,13.5,72,1 30.5,4,97,78,2190,14.1,77,2 14,8,318,150,4096,13,71,1 23,4,120,97,2506,14.5,72,3 18,6,250,88,3139,14.5,71,1 22,4,122,86,2395,16,72,1 26,4,97,46,1835,20.5,70,2 13,8,350,145,4055,12,76,1 15.5,8,351,142,4054,14.3,79,1 23.5,6,173,110,2725,12.6,81,1 19.2,8,305,145,3425,13.2,78,1 35.1,4,81,60,1760,16.1,81,3 18,6,225,105,3613,16.5,74,1 
18.1,8,302,139,3205,11.2,78,1 29,4,90,70,1937,14.2,76,2 16.5,8,350,180,4380,12.1,76,1 26,4,98,90,2265,15.5,73,2 13,8,350,145,3988,13,73,1 29.8,4,134,90,2711,15.5,80,3 29.5,4,97,71,1825,12.2,76,2 13,8,350,155,4502,13.5,72,1 27.2,4,135,84,2490,15.7,81,1 19,6,225,95,3264,16,75,1 18.5,8,360,150,3940,13,79,1 15.5,8,304,120,3962,13.9,76,1 28.8,6,173,115,2595,11.3,79,1 16,8,400,180,4220,11.1,77,1 30,4,135,84,2385,12.9,81,1 43.4,4,90,48,2335,23.7,80,2 22,4,140,72,2408,19,71,1 17,8,302,140,3449,10.5,70,1 33.5,4,85,70,1945,16.8,77,3 25,4,113,95,2228,14,71,3 39.1,4,79,58,1755,16.9,81,3 29,4,85,52,2035,22.2,76,1 34.1,4,91,68,1985,16,81,3 31,4,79,67,2000,16,74,2 24,4,90,75,2108,15.5,74,2 30,4,88,76,2065,14.5,71,2 20.5,6,231,105,3425,16.9,77,1 37.7,4,89,62,2050,17.3,81,3 18,6,250,105,3459,16,75,1 13,8,350,165,4274,12,72,1 37,4,119,92,2434,15,80,3 19,6,232,90,3211,17,75,1 20,6,232,100,2914,16,75,1 25,4,104,95,2375,17.5,70,2 15,8,383,170,3563,10,70,1 17.5,8,305,140,4215,13,76,1 37,4,85,65,1975,19.4,81,3 24,4,119,97,2545,17,75,3 15,6,258,110,3730,19,75,1 34.5,4,105,70,2150,14.9,79,1 19.8,6,200,85,2990,18.2,79,1 32,4,83,61,2003,19,74,3 32,4,135,84,2295,11.6,82,1 24,4,134,96,2702,13.5,75,3 11,8,429,208,4633,11,72,1 44,4,97,52,2130,24.6,82,2 32,4,85,70,1990,17,76,3 19,6,232,100,2901,16,74,1 23,4,122,86,2220,14,71,1 20,4,130,102,3150,15.7,76,2 9,8,304,193,4732,18.5,70,1 33.5,4,98,83,2075,15.9,77,1 31.9,4,89,71,1925,14,79,2 19.1,6,225,90,3381,18.7,80,1 15,8,318,150,3777,12.5,73,1 33,4,105,74,2190,14.2,81,2 23,8,350,125,3900,17.4,79,1 16.2,6,163,133,3410,15.8,78,2 26,4,79,67,1963,15.5,74,2 17.6,6,225,85,3465,16.6,81,1 13,8,302,140,4294,16,72,1 26,4,91,70,1955,20.5,71,1 14,8,318,150,4077,14,72,1 24.2,6,146,120,2930,13.8,81,3 29.9,4,98,65,2380,20.7,81,1 22,6,232,112,2835,14.7,82,1 37.2,4,86,65,2019,16.4,80,3 18,6,232,100,2789,15,73,1 18,6,199,97,2774,15.5,70,1 14,8,304,150,3672,11.5,73,1 26,4,98,79,2255,17.7,76,1 20.5,6,225,100,3430,17.2,78,1 17,6,231,110,3907,21,75,1 
27,4,140,86,2790,15.6,82,1 31.5,4,98,68,2045,18.5,77,3 16,6,250,100,3278,18,73,1 12,8,455,225,4951,11,73,1 15.5,8,318,145,4140,13.7,77,1 20.5,6,200,95,3155,18.2,78,1 25,4,116,81,2220,16.9,76,2 28,4,112,88,2605,19.6,82,1 17.5,8,305,145,3880,12.5,77,1 15,8,304,150,3892,12.5,72,1 19,6,250,100,3282,15,71,1 13,8,302,130,3870,15,76,1 15,8,318,150,3399,11,73,1 19,6,156,108,2930,15.5,76,3 18,6,258,110,2962,13.5,71,1 36,4,120,88,2160,14.5,82,3 30.7,6,145,76,3160,19.6,81,2 17,8,260,110,4060,19,77,1 31,4,79,67,1950,19,74,3 34.4,4,98,65,2045,16.2,81,1 12,8,429,198,4952,11.5,73,1 26,4,121,113,2234,12.5,70,2 22,6,146,97,2815,14.5,77,3 14,8,351,153,4129,13,72,1 16.9,8,350,155,4360,14.9,79,1 21,6,199,90,2648,15,70,1 34,4,112,88,2395,18,82,1 21.5,3,80,110,2720,13.5,77,3 34.1,4,86,65,1975,15.2,79,3 20,4,140,90,2408,19.5,72,1 27.2,4,119,97,2300,14.7,78,3 46.6,4,86,65,2110,17.9,80,3 23,4,97,54,2254,23.5,72,2 14,8,351,153,4154,13.5,71,1 21,6,155,107,2472,14,73,1 21.1,4,134,95,2515,14.8,78,3 11,8,318,210,4382,13.5,70,1 27,4,97,60,1834,19,71,2 15,6,250,72,3432,21,75,1 28,4,97,75,2155,16.4,76,3 24,4,107,90,2430,14.5,70,2 16.5,8,351,138,3955,13.2,79,1 18,6,250,78,3574,21,76,1 28,4,120,79,2625,18.6,82,1 15,8,318,150,4135,13.5,72,1 32.9,4,119,100,2615,14.8,81,3 40.8,4,85,65,2110,19.2,80,3 24.5,4,98,60,2164,22.1,76,1 13,8,400,190,4422,12.5,72,1 35,4,72,69,1613,18,71,3 16,6,225,105,3439,15.5,71,1 20.8,6,200,85,3070,16.7,78,1 26,4,97,46,1950,21,73,2 25,4,140,92,2572,14.9,76,1 23,6,198,95,2904,16,73,1 30,4,79,70,2074,19.5,71,2 15,8,390,190,3850,8.5,70,1 32.4,4,107,72,2290,17,80,3 13,8,302,129,3169,12,75,1 17,8,305,130,3840,15.4,79,1 10,8,360,215,4615,14,70,1 17.5,6,250,110,3520,16.4,77,1 32.4,4,108,75,2350,16.8,81,3 17.5,8,318,140,4080,13.7,78,1 26,4,122,80,2451,16.5,74,1 16,8,318,150,4190,13,76,1 11,8,400,150,4997,14,73,1 23.7,3,70,100,2420,12.5,80,3 28,4,98,80,2164,15,72,1 44.3,4,90,48,2085,21.7,80,2 13,8,360,170,4654,13,73,1 20,8,262,110,3221,13.5,75,1 22,6,250,105,3353,14.5,76,1 
26.4,4,140,88,2870,18.1,80,1 14,8,350,165,4209,12,71,1 18,4,121,112,2933,14.5,72,2 15.5,8,400,190,4325,12.2,77,1 28,4,97,92,2288,17,72,3 33,4,91,53,1795
Answered 1 day after Mar 23, 2021

Answer To: "mpg","cylinders","displacement","horsepower","weight","acceleration","year","origin"...

Vicky answered on Mar 24 2021
151 Votes
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Q1. The purpose of this question is to implement your own version of principal component regression (PCR). Overall, the PCR implementation is no more than a dozen lines of code, but I will walk you through the steps so that\n",
"you fully understand what is being done. The advantage of knowing this is that you will not be restricted to\n",
"what PCR does for fitting (that is, first performing a dimension reduction and then a linear fit). In\n",
"the future you can perform the dimension reduction step, and then instead of doing a linear fit, pick any\n",
"other algorithm of your choice, such as random forest, neural networks, etc. To answer this question, you\n",
"may find Slide 24 of lecture 8 useful. Also, during the afternoon session recording (video time 3:18:18) one\n",
"of the students asked a question and you may find the answer to it very related to this question."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# import libraries\n",
"import pandas as pd\n",
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.preprocessing import scale \n",
"from sklearn.decomposition import PCA\n",
"from sklearn import model_selection\n",
"from sklearn.linear_model import LinearRegression \n",
"from sklearn import metrics"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"
AtBatHitsHmRunRunsRBIWalksYearsCAtBatCHitsCHmRunCRunsCRBICWalksLeagueDivisionPutOutsAssistsErrorsSalaryNewLeague
04751232776937241810471108292343267102261061220.0001
158415815708442523586365826531613410331204662.5001
248412720666567730068441164364583771012318071183.3331
3642211141075952523647702735223019311337194740.0001
431181342302617824721981009509096901115322310320.0001
\n",
"
"
],
"text/plain": [
" AtBat Hits HmRun Runs RBI Walks Years CAtBat CHits CHmRun CRuns \\\n",
"0 475 123 27 76 93 72 4 1810 471 108 292 \n",
"1 584 158 15 70 84 42 5 2358 636 58 265 \n",
"2 484 127 20 66 65 67 7 3006 844 116 436 \n",
"3 642 211 14 107 59 52 5 2364 770 27 352 \n",
"4 311 81 3 42 30 26 17 8247 2198 100 950 \n",
"\n",
" CRBI CWalks League Division PutOuts Assists Errors Salary \\\n",
"0 343 267 1 0 226 10 6 1220.000 \n",
"1 316 134 1 0 331 20 4 662.500 \n",
"2 458 377 1 0 1231 80 7 1183.333 \n",
"3 230 193 1 1 337 19 4 740.000 \n",
"4 909 690 1 1 153 223 10 320.000 \n",
"\n",
" NewLeague \n",
"0 1 \n",
"1 1 \n",
"2 1 \n",
"3 1 \n",
"4 1 "
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Reading data\n",
"data = pd.read_csv('myhitters.csv')\n",
"data.head()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"\n",
"
countmeanstdmin25%50%75%max
AtBat263.0403.642586147.30720919.0282.5413.0526.0687.0
Hits263.0107.82889745.1253261.071.5103.0141.5238.0
HmRun263.011.6197728.7571080.05.09.018.040.0
Runs263.054.74524725.5398160.033.552.073.0130.0
RBI263.051.48669225.8827140.030.047.071.0121.0
Walks263.041.11406821.7180560.023.037.057.0105.0
Years263.07.3117874.7936161.04.06.010.024.0
CAtBat263.02657.5437262286.58292919.0842.51931.03890.514053.0
CHits263.0722.186312648.1996444.0212.0516.01054.04256.0
CHmRun263.069.23954482.1975810.015.040.092.5548.0
CRuns263.0361.220532331.1985712.0105.5250.0497.52165.0
CRBI263.0330.418251323.3676683.095.0230.0424.51659.0
CWalks263.0260.266160264.0558681.071.0174.0328.51566.0
League263.00.4714830.5001380.00.00.01.01.0
Division263.00.5095060.5008630.00.01.01.01.0
PutOuts263.0290.711027279.9345750.0113.5224.0322.51377.0
Assists263.0118.760456145.0805770.08.045.0192.0492.0
Errors263.08.5931566.6065740.03.07.013.032.0
Salary263.0535.925882451.11868167.5190.0425.0750.02460.0
NewLeague263.00.4638780.4996440.00.00.01.01.0
\n",
"
"
],
"text/plain": [
" count mean std min 25% 50% 75% \\\n",
"AtBat 263.0 403.642586 147.307209 19.0 282.5 413.0 526.0 \n",
"Hits 263.0 107.828897 45.125326 1.0 71.5 103.0 141.5 \n",
"HmRun 263.0 11.619772 8.757108 0.0 5.0 9.0 18.0 \n",
"Runs 263.0 54.745247 25.539816 0.0 33.5 52.0 73.0 \n",
"RBI 263.0 51.486692 25.882714 0.0 30.0 47.0 71.0 \n",
"Walks 263.0 41.114068 21.718056 0.0 23.0 37.0 57.0 \n",
"Years 263.0 7.311787 4.793616 1.0 4.0 6.0 10.0 \n",
"CAtBat 263.0 2657.543726 2286.582929 19.0 842.5 1931.0 3890.5 \n",
"CHits 263.0 722.186312 648.199644 4.0 212.0 516.0 1054.0 \n",
"CHmRun 263.0 69.239544 82.197581 0.0 15.0 40.0 92.5 \n",
"CRuns 263.0 361.220532 331.198571 2.0 105.5 250.0 497.5 \n",
"CRBI 263.0 330.418251 323.367668 3.0 95.0 230.0 424.5 \n",
"CWalks 263.0 260.266160 264.055868 1.0 71.0 174.0 328.5 \n",
"League 263.0 0.471483 0.500138 0.0 0.0 0.0 1.0 \n",
"Division 263.0 0.509506 0.500863 0.0 0.0 1.0 1.0 \n",
"PutOuts 263.0 290.711027 279.934575 0.0 113.5 224.0 322.5 \n",
"Assists 263.0 118.760456 145.080577 0.0 8.0 45.0 192.0 \n",
"Errors 263.0 8.593156 6.606574 0.0 3.0 7.0 13.0 \n",
"Salary 263.0 535.925882 451.118681 67.5 190.0 425.0 750.0 \n",
"NewLeague 263.0 0.463878 0.499644 0.0 0.0 0.0 1.0 \n",
"\n",
" max \n",
"AtBat 687.0 \n",
"Hits 238.0 \n",
"HmRun 40.0 \n",
"Runs 130.0 \n",
"RBI 121.0 \n",
"Walks 105.0 \n",
"Years 24.0 \n",
"CAtBat 14053.0 \n",
"CHits 4256.0 \n",
"CHmRun 548.0 \n",
"CRuns 2165.0 \n",
"CRBI 1659.0 \n",
"CWalks 1566.0 \n",
"League 1.0 \n",
"Division 1.0 \n",
"PutOuts 1377.0 \n",
"Assists 492.0 \n",
"Errors 32.0 \n",
"Salary 2460.0 \n",
"NewLeague 1.0 "
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.describe().T"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(263, 20)"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data.shape"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"– (a) In the homework folder you would have access to the dataset MyHitters.csv, where the ultimate\n",
"goal is modeling the variable Salary in terms of the other columns of the data. Split the data into a\n",
"Train and Test set. Use the first 131 rows of the data for training and the next 132 rows as test. Fit\n",
"a linear model to the Train, which models Salary in terms of all other features. Report the MSE of\n",
"the Test."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"X = data[['AtBat','Hits','HmRun','Runs','RBI','Walks','Years','CAtBat','CHits','CHmRun','CRuns','CRBI','CWalks','League','Division','PutOuts','Assists','Errors','NewLeague']]\n",
"y = data['Salary']\n",
"\n",
"X_train = X[:131]\n",
"y_train = y[:131]\n",
"X_test = X[131:]\n",
"y_test = y[131:]"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"LinearRegression()"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"lr = LinearRegression() \n",
"lr.fit(X_train, y_train)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"y_pred = lr.predict(X_test) # Predicting the scores"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Mean Squared Error: 114780.61044842948\n"
]
}
],
"source": [
"print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred)) "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"– (b) Use the pcr command in R (or Python) to fit a model that predicts the response variable Salary.\n",
"In your pcr function and for validation, use the option ‘‘LOO’’ instead of ‘‘CV’’, so that your\n",
"cross validation is done in a deterministic manner. To select the number of components for your\n",
"prediction, use the number of components that minimizes the cross-validation error. You should see\n",
"that 6 components are enough to get the best cross-validation error. Present the graph of the cross-validation error in terms of the number of components, and report the Test accuracy. Do you see an\n",
"improvement in accuracy, compared to part (a)?\n"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [],
"source": [
"# Scale the data\n",
"pca = PCA()\n",
"X_reduced_train = pca.fit_transform(scale(X_train))\n",
"n = len(X_reduced_train)"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"
"
]
},
"metadata": {
"needs_background": "light"
},
"output_type": "display_data"
}
],
"source": [
"# create loocv procedure\n",
"cv = model_selection.LeaveOneOut()\n",
"\n",
"regr = LinearRegression()\n",
"mse = []\n",
"\n",
"# Calculate MSE using CV for the 19 principal components, adding one component at a time.\n",
"for i in np.arange(1, 20):\n",
" score = -1*model_selection.cross_val_score(regr, X_reduced_train[:,:i], y_train.ravel(), cv=cv, scoring='neg_mean_squared_error').mean()\n",
" mse.append(score)\n",
"\n",
"plt.plot(np.array(mse), '-v')\n",
"plt.xlabel('Number of principal components in regression')\n",
"plt.ylabel('MSE')\n",
"plt.title('Salary')\n",
"plt.xlim(xmin=-1);"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"We find that the lowest cross-validation error occurs when M=6 components are used (this point appears at x=5 in the plot above, because the plot's x-axis is zero-based)."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"96162.91969792108"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"X_reduced_test = pca.transform(scale(X_test))[:,:6]\n",
"\n",
"# Train regression model on training data \n",
"regr = LinearRegression()\n",
"regr.fit(X_reduced_train[:,:6], y_train)\n",
"\n",
"# Prediction with test data\n",
"pred = regr.predict(X_reduced_test)\n",
"metrics.mean_squared_error(y_test, pred)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
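"The test MSE in part (a) (about 114,781) is greater than the test MSE in part (b) (about 96,163), so the PCR model in part (b) is more accurate on the Test set than the full linear model in part (a)."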
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"– (c) Now let’s implement your own version of PCR! Consider X to be the feature matrix of the entire\n",
"data (not only the Train or Test), and y the response column in the entire data. To implement\n",
"your own PCR you would need to follow the steps below:\n",
"\n",
"– First, center and scale X. This step should be consistently done once for the entire data, NOT\n",
"the Train and Test, separately. For this purpose you can use the command below in R:\n",
"\n",
"X = scale(X,scale=TRUE)\n",
"\n",
"Now consider Xtr and Xts to be the portions of the centered/scaled X corresponding to the\n",
"Train and Test, and ytr and yts to be the portions of y corresponding to the Train and Test,\n",
"respectively.\n",
"\n",
"– Take an SVD of Xtr to produce the matrices Utr, Σtr and Vtr. Since, similar to part (b), you\n",
"only want to use 6 principal components, keep only the first 6 columns of Utr and Vtr and only\n",
"the first 6 rows and columns of Σtr. The reduced sizes of the matrices Utr, Σtr and Vtr should\n",
"be 131 × 6, 6 × 6 and 19 × 6, respectively.\n",
"\n",
"– Next, form the matrix C as in slide 24. This matrix is simply the product of your reduced-size\n",
"matrix Vtr and the inverse of Σtr, i.e. $C = V_{tr} \\Sigma_{tr}^{-1}$. This matrix is now like the magic transformer that maps your data from\n",
"the X space to the reduced-dimension space.\n",
"\n",
"– Next, fit a linear model which predicts ytr in terms of the 6 features in Utr. Notice that Utr is\n",
"already the mapping of Xtr to the reduced-dimension space.\n",
"\n",
"– In order to test your linear fit in the reduced-dimension space, calculate Uts = XtsC, which is\n",
"basically the transformation of your test data to the reduced-dimension space. Report the MSE\n",
"value of your linear fit when tested against Uts.\n",
"\n",
"– You should see that your accuracy is around the one in part (b). The numbers might be up to\n",
"1% different, but that is mainly due to slight algorithmic differences."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"from scipy.linalg import svd"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"X = scale(X)"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [],
"source": [
"Xtr = X[:131]\n",
"ytr = y[:131]\n",
"Xts = X[131:]\n",
"yts = y[131:]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(131, 19)"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Xtr.shape"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"Utr, Str, Vtr = svd(Xtr, full_matrices=True)"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
"Utr = Utr[:,:6]\n",
"Vtr = Vtr[:,:6]\n",
"Str = Str[:6]"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(131, 6)"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Utr.shape"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(6,)"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Str.shape"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(19, 6)"
]
},
"execution_count": 20,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"Vtr.shape"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [],
"source": [
"C = Vtr*Str"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([[-7.73625299e+00, -5.74385005e+00, -3.24624098e+00,\n",
" -3.29896293e+00, -2.83254407e+00, -2.36595868e+00],\n",
" [ 1.14958859e+01, 8.81323513e+00, 2.65627483e+00,\n",
" 5.18240420e+00, 2.95082698e+00, 2.18224488e+00],\n",
" [-7.26150625e-01, 4.40116284e-02, 1.74128267e+00,\n",
" 9.07134725e-01, 4.37927390e-01, -4.96799632e-01],\n",
" [ 1.14354272e+00, 4.33918617e-01, -5.04128573e+00,\n",
" -1.38984015e+00, -1.61380327e+00, -1.34290238e+00],\n",
" [-3.05664873e+00, -1.62771265e+00, 3.27371480e-01,\n",
" -5.14157362e-01, 2.45629431e-01, -1.12979377e-01],\n",
" [...
SOLUTION.PDF

Answer To This Question Is Available To Download

Related Questions & Answers

More Questions »

Submit New Assignment

Copy and Paste Your Assignment Here