Answer To: I am currently facing problems with implementing my models. I need help with optimizing them.
Ximi answered on Oct 10 2021
{
"cells": [
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 119,
"metadata": {},
"outputs": [],
"source": [
"# Read the three survey CSV exports, one DataFrame per file.\n",
"df1, df2, df3 = (\n",
"    pd.read_csv(csv_name)\n",
"    for csv_name in (\n",
"        'white-browedtreecreeper-2nqa4cjx.csv',\n",
"        'smalltriggerplant-aoxxxyz3.csv',\n",
"        'commonbeard-heath-fh1buzob.csv',\n",
"    )\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 120,
"metadata": {},
"outputs": [],
"source": [
"# Stack the three per-file frames row-wise into one frame.\n",
"# Start from df1: `df` does not exist yet on a fresh kernel, and on a re-run\n",
"# it would append df2/df3 to an already-combined frame.\n",
"df = pd.concat([df1, df2, df3], axis=0)"
]
},
{
"cell_type": "code",
"execution_count": 121,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Taxon ID int64\n",
"Scientific Name object\n",
"Common Name object\n",
"Conservation Status object\n",
"Cover Abundance object\n",
"Total Count object\n",
"Survey Start Date object\n",
"Survey End Date object\n",
"Survey/Observation ID int64\n",
"Site Location Description object\n",
"Survey method object\n",
"Water Body object\n",
"Observer object\n",
"Extra Info object\n",
"Type of Record object\n",
"Reliability object\n",
"Altitude object\n",
"Latitude GDA94 float64\n",
"Longitude GDA94 float64\n",
"Accuracy int64\n",
"Site ID int64\n",
"Site Name object\n",
"Drainage Division object\n",
"River Basin object\n",
"Drains Into object\n",
"EPBC object\n",
"FFG object\n",
"Victorian Advisory List object\n",
"Taxon Origin object\n",
"Treaty object\n",
"Licence Name object\n",
"Project ID int64\n",
"dtype: object"
]
},
"execution_count": 121,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the dtype pandas assigned to each column of the combined frame.\n",
"df.dtypes"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [],
"source": [
"# Candidate raw input columns and the target column.\n",
"# Column names are case-sensitive: the frame has 'Survey method' (lowercase m).\n",
"features = ['Survey Start Date', 'Survey End Date', 'Site Location Description',\n",
"            'Survey method', 'Observer', 'Type of Record']\n",
"label = 'Reliability'"
]
},
{
"cell_type": "code",
"execution_count": 122,
"metadata": {},
"outputs": [],
"source": [
"from datetime import datetime\n"
]
},
{
"cell_type": "code",
"execution_count": 123,
"metadata": {},
"outputs": [],
"source": [
"def get_date_object(string):\n",
"    \"\"\"Parse a 'dd-mm-yy' string (e.g. '10-10-21') into a datetime.\"\"\"\n",
"    return datetime.strptime(string, '%d-%m-%y')\n",
"\n",
"\n",
"def _is_date_string(value):\n",
"    \"\"\"Return True if value is a string longer than two characters.\"\"\"\n",
"    return isinstance(value, str) and len(value) > 2\n",
"\n",
"\n",
"def get_date_delta(start, end):\n",
"    \"\"\"Return the whole number of days from start to end.\n",
"\n",
"    Both arguments are expected to be 'dd-mm-yy' strings. If either one is\n",
"    not a string (for example NaN) or is too short to hold a date, 0 is\n",
"    returned instead of attempting to parse it.\n",
"    \"\"\"\n",
"    # Check each value separately: `len(start) and len(end) > 2` only\n",
"    # length-checked `end`, letting 1-2 character starts reach strptime.\n",
"    if not (_is_date_string(start) and _is_date_string(end)):\n",
"        return 0\n",
"    return (get_date_object(end) - get_date_object(start)).days"
]
},
{
"cell_type": "code",
"execution_count": 124,
"metadata": {},
"outputs": [],
"source": [
"def _survey_days(row):\n",
"    \"\"\"Days between one row's survey start and end dates.\"\"\"\n",
"    return get_date_delta(row['Survey Start Date'], row['Survey End Date'])\n",
"\n",
"\n",
"# Row-wise: one duration value per survey record.\n",
"df['Survey Duration'] = df.apply(_survey_days, axis=1)"
]
},
{
"cell_type": "code",
"execution_count": 125,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"from sklearn.preprocessing import LabelEncoder"
]
},
{
"cell_type": "code",
"execution_count": 126,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# One LabelEncoder per categorical column, each fitted on its own column.\n",
"le_survey, le_observer, le_record = LabelEncoder(), LabelEncoder(), LabelEncoder()\n",
"\n",
"# Write the integer codes to a new '<column> encoded' column next to the original.\n",
"for source_column, encoder in [\n",
"    ('Survey method', le_survey),\n",
"    ('Observer', le_observer),\n",
"    ('Type of Record', le_record),\n",
"]:\n",
"    df[source_column + ' encoded'] = encoder.fit_transform(df[source_column])\n"
]
},
{
"cell_type": "code",
"execution_count": 127,
"metadata": {},
"outputs": [],
"source": [
"# Model inputs: the derived duration plus the three label-encoded columns.\n",
"features = [\n",
"    'Survey Duration',\n",
"    'Survey method encoded',\n",
"    'Observer encoded',\n",
"    'Type of Record encoded',\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 134,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.ensemble import ExtraTreesClassifier\n",
"from sklearn.model_selection import train_test_split"
]
},
{
"cell_type": "code",
"execution_count": 129,
"metadata": {},
"outputs": [],
"source": [
"# Extra-trees ensemble with 250 trees; random_state=0 fixes the seed used when building them.\n",
"forest = ExtraTreesClassifier(n_estimators=250,\n",
" random_state=0)\n"
]
},
{
"cell_type": "code",
"execution_count": 130,
"metadata": {},
"outputs": [],
"source": [
"# Feature matrix: the columns named in `features`, as a NumPy array.\n",
"X = df[features].values"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [],
"source": [
"# Target vector: the column named by `label`, as a NumPy array.\n",
"y = df[label].values"
]
},
{
"cell_type": "code",
"execution_count": 135,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',\n",
" max_depth=None, max_features='auto', max_leaf_nodes=None,\n",
" min_impurity_decrease=0.0, min_impurity_split=None,\n",
" min_samples_leaf=1, min_samples_split=2,\n",
" min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=None,\n",
" oob_score=False, random_state=0, verbose=0, warm_start=False)"
]
},
"execution_count": 135,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Hold out 20% of the rows for testing. The shuffle is seeded so the split,\n",
"# and everything computed from it, is the same on every run.\n",
"train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.2, random_state=0)\n",
"forest.fit(train_x, train_y)"
]
},
{
"cell_type": "code",
"execution_count": 133,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Feature ranking:\n",
"1. feature 2 (0.471299)\n",
"2. feature 1 (0.207056)\n",
"3. feature 3 (0.196751)\n",
"4. feature 0 (0.124894)\n"
]
},
{
"data": {
"image/png":...