Answer To: Instruction: use the data from the zip file, read the question carefully and answer every step and...
Ximi answered on Mar 23 2021
{
"cells": [
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"#Imports\n",
"import pandas as pd\n",
"from sklearn.ensemble import RandomForestClassifier\n",
"\n",
"df = pd.read_csv('data_homwork.csv')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(4999, 1805)"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"#Data rows and columns\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"#columns\n",
"columns = df.columns"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Random Forest Classifier\n",
"We will first build a model on all features, then use its feature importances to reduce the feature set.\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"metadata": {},
"outputs": [],
"source": [
"# Separate the label column from the predictors\n",
"y = df['target']\n",
"X = df.drop(columns=['target'])"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {
"scrolled": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Feature ranking Top 10:\n",
"1. feature ent_q_diff_diffs_2_median (0.013538)\n",
"2. feature TB_77 (0.012635)\n",
"3. feature Img0.1 (0.011410)\n",
"4. feature TB_a9 (0.010800)\n",
"5. feature TB_b1 (0.010358)\n",
"6. feature _exit (0.010349)\n",
"7. feature TB_93 (0.008868)\n",
"8. feature TB_a3 (0.008600)\n",
"9. feature TB_82 (0.008381)\n",
"10. feature TB_aa (0.008079)\n"
]
}
],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"%matplotlib inline\n",
"\n",
"# Fit a random forest on every feature; random_state is fixed so reruns give the same ranking.\n",
"forest = RandomForestClassifier(n_estimators=50, random_state=0)\n",
"forest.fit(X, y)\n",
"\n",
"# Importance per feature from the fitted forest, plus its standard deviation\n",
"# across the individual trees.\n",
"importances = forest.feature_importances_\n",
"std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)\n",
"\n",
"# Feature positions ordered from most to least important.\n",
"indices = np.argsort(importances)[::-1]\n",
"\n",
"# importances is aligned with the columns the model was fitted on (X), not with\n",
"# df.columns, which still contains 'target'. Taking the names from X keeps the\n",
"# labels correct regardless of where the target column sits in the frame.\n",
"feature_names = X.columns\n",
"\n",
"# Print the feature ranking (slicing also copes with fewer than 10 features)\n",
"print(\"Feature ranking Top 10:\")\n",
"\n",
"for rank, idx in enumerate(indices[:10], start=1):\n",
"    print(\"%d. feature %s (%f)\" % (rank, feature_names[idx], importances[idx]))\n"
]
},
{
"cell_type": "code",
"execution_count": 43,
"metadata": {},
"outputs": [
{
"data": {
"image/png":...