I currently am facing problems with implementing my models. I need help with optimizing them.

Question

Ximi · Accepted Answer

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 111,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 119,
   "metadata": {},
   "outputs": [],
   "source": [
    "df1 = pd.read_csv('white-browedtreecreeper-2nqa4cjx.csv')
",
    "df2 = pd.read_csv('smalltriggerplant-aoxxxyz3.csv')
",
    "df3 = pd.read_csv('commonbeard-heath-fh1buzob.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 120,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.concat([df, df2, df3], axis=0)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 121,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Taxon ID                       int64
",
       "Scientific Name               object
",
       "Common Name                   object
",
       "Conservation Status           object
",
       "Cover Abundance               object
",
       "Total Count                   object
",
       "Survey Start Date             object
",
       "Survey End Date               object
",
       "Survey/Observation ID          int64
",
       "Site Location Description     object
",
       "Survey method                 object
",
       "Water Body                    object
",
       "Observer                      object
",
       "Extra Info                    object
",
       "Type of Record                object
",
       "Reliability                   object
",
       "Altitude                      object
",
       "Latitude GDA94               float64
",
       "Longitude GDA94              float64
",
       "Accuracy                       int64
",
       "Site ID                        int64
",
       "Site Name                     object
",
       "Drainage Division             object
",
       "River Basin                   object
",
       "Drains Into                   object
",
       "EPBC                          object
",
       "FFG                           object
",
       "Victorian Advisory List       object
",
       "Taxon Origin                  object
",
       "Treaty                        object
",
       "Licence Name                  object
",
       "Project ID                     int64
",
       "dtype: object"
      ]
     },
     "execution_count": 121,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.dtypes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 70,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = ['Survey Start Date', 'Survey End Date', 'Site Location Description', 
",
    "               'Survey Method', 'Observer', 'Type of Record']
",
    "label = 'Reliability'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 122,
   "metadata": {},
   "outputs": [],
   "source": [
    "from datetime import datetime
"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 123,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_date_object(string):
",
    "    return datetime.strptime(string, '%d-%m-%y')
",
    "
",
    "def get_date_delta(start, end):
",
    "    #print ('start', start, 'end', end)
",
    "    if len(start) and len(end) > 2:
",
    "     #   print (start, end)
",
    "        start = get_date_object(start)
",
    "        end = get_date_object(end)
",
    "        delta = end - start
",
    "        return delta.days
",
    "    else:
",
    "        return 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 124,
   "metadata": {},
   "outputs": [],
   "source": [
    "df['Survey Duration'] = df.apply(lambda x: get_date_delta(x['Survey Start Date'], x['Survey End Date']), axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 125,
   "metadata": {
    "scrolled": true
   },
   "outputs": [],
   "source": [
    "from sklearn.preprocessing import LabelEncoder"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 126,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np
",
    "le_survey = LabelEncoder()
",
    "le_observer = LabelEncoder()
",
    "le_record = LabelEncoder()
",
    "df['Survey method encoded'] = le_survey.fit_transform(df['Survey method'])
",
    "df['Observer encoded'] = le_observer.fit_transform(df['Observer'])
",
    "df['Type of Record encoded'] = le_record.fit_transform(df['Type of Record'])
"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 127,
   "metadata": {},
   "outputs": [],
   "source": [
    "features = ['Survey Duration', 'Survey method encoded', 'Observer encoded', 'Type of Record encoded']"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 134,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np
",
    "import matplotlib.pyplot as plt
",
    "from sklearn.ensemble import ExtraTreesClassifier
",
    "from sklearn.model_selection import train_test_split"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 129,
   "metadata": {},
   "outputs": [],
   "source": [
    "forest = ExtraTreesClassifier(n_estimators=250,
",
    "                              random_state=0)
"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 130,
   "metadata": {},
   "outputs": [],
   "source": [
    "X = df[features].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 131,
   "metadata": {},
   "outputs": [],
   "source": [
    "y = df[label].values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 135,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
",
       "           max_depth=None, max_features='auto', max_leaf_nodes=None,
",
       "           min_impurity_decrease=0.0, min_impurity_split=None,
",
       "           min_samples_leaf=1, min_samples_split=2,
",
       "           min_weight_fraction_leaf=0.0, n_estimators=250, n_jobs=None,
",
       "           oob_score=False, random_state=0, verbose=0, warm_start=False)"
      ]
     },
     "execution_count": 135,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_x, test_x, train_y, test_y = train_test_split(X,y, test_size=0.2)
",
    "forest.fit(train_x,train_y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 133,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Feature ranking:
",
      "1. feature 2 (0.471299)
",
      "2. feature 1 (0.207056)
",
      "3. feature 3 (0.196751)
",
      "4. feature 0 (0.124894)
"
     ]
    },
    {
     "data": {
      "image/png":

I currently am facing problems with implementing my models. I need help with optimizing them.

Answer To: I currently am facing problems with implementing my models. I need help with optimizing them.

Answer To This Question Is Available To Download

Related Questions & Answers

Submit New Assignment