# Answer to: "in the zip file PDF are all instructions"
# Shivinder answered on Apr 18 2021
import re
import string
import operator
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.linalg import svd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from sklearn.manifold import TSNE
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import CountVectorizer
######################################Subchallenge 1################################################
# Read the job ads into a single-column frame, discard rows with missing
# values and renumber the remaining rows from zero.
dataset = (
    pd.read_csv(
        'jobsen-3iocijeo.csv',
        header=None,
        names=['Job Desc'],
        encoding='ISO-8859-1',
    )
    .dropna()
    .reset_index(drop=True)
)
def clean_mail(text_lines):
    """Strip punctuation, digits and English stop words from a text.

    Args:
        text_lines: The text to clean, as a single string.

    Returns:
        The surviving words joined by single spaces. Letter case is left
        unchanged.
    """
    # Escape the ASCII punctuation before splicing it into the character
    # class, so that the backslash, ']', '^' and '-' are all taken literally
    # instead of relying on how they happen to be ordered.
    punctuation_class = '[{}“”¨«»®´·º½¾¿¡§£₤‘’]'.format(re.escape(string.punctuation))
    text_lines = re.sub(punctuation_class, ' ', text_lines)
    # Remove every digit run together with an optional preceding character
    # that is not a lowercase ASCII letter (typically the separating space).
    # NOTE(review): that optional character can also be an uppercase letter,
    # e.g. "SQL2008" becomes "SQ" - confirm whether this is intended.
    # No digits remain after this step, so no further pass over
    # letter/digit mixes is needed.
    text_lines = re.sub(r'[^a-z]?[0-9]+', '', text_lines)
    # A set gives constant-time membership tests; the comparison stays exact
    # (case-sensitive), so differently-cased variants of a stop word are kept.
    stop_words = set(stopwords.words('english'))
    return ' '.join(
        word for word in text_lines.split(' ') if word and word not in stop_words
    )
# Technology keywords, searched as substrings of the lower-cased text.
# "net" (from ".NET") has to stand alone as a token, optionally with its
# leading dot, so that words such as "cabinet" or "internet" do not count.
_IT_KEYWORD_RE = re.compile(
    r'sql|linux|java|python|windows|network|eclipse|agile'
    r'|(?<![a-z0-9])\.?net\b'
)
# The acronym "IT" only counts in upper case and as a whole word, so that
# upper-case words merely containing the letters (e.g. "QUALITY") do not match.
_IT_ACRONYM_RE = re.compile(r'\bIT\b')


def non_it_job(text_lines):
    """Classify a job-ad text as IT related or not.

    Args:
        text_lines: The ad text as a single string.

    Returns:
        'Yes' when the text mentions one of the technology keywords (in any
        letter case) or the stand-alone upper-case acronym "IT"; 'No'
        otherwise, i.e. 'No' marks a non-IT ad.
    """
    if _IT_KEYWORD_RE.search(text_lines.lower()) or _IT_ACRONYM_RE.search(text_lines):
        return 'Yes'
    return 'No'
# Clean every ad, then flag it: 'Yes' marks an IT ad, 'No' a non-IT ad.
dataset['Job Desc'] = dataset['Job Desc'].apply(clean_mail)
dataset['IT_or_Non_IT'] = dataset['Job Desc'].apply(non_it_job)
print('Non It jobs ads are')
# Select the non-IT ads with a boolean mask instead of indexing row by row.
for job_desc in dataset.loc[dataset['IT_or_Non_IT'] == 'No', 'Job Desc']:
    print(job_desc)
vector = CountVectorizer()
ad_Vectors = vector.fit_transform(dataset['Job Desc']).toarray() #Vector space of ads
print(ad_Vectors)
# Build the ids in one column assignment. Writing through
# dataset['Job Id'].iloc[i] is chained assignment, which pandas may apply to
# a temporary copy and leave the frame itself unchanged.
dataset['Job Id'] = ['Job_{}'.format(i) for i in range(len(dataset))]
######################################Subchallenge 2###############################################
# create kmeans object
kmeans = KMeans(n_clusters=4) #Divide jobs data in 4 clusters
# fit kmeans object to data
kmeans.fit(ad_Vectors)
# print location of clusters learned by kmeans object
print(kmeans.cluster_centers_)
# Reuse the labels from the fit above. Calling fit_predict here would train
# the model a second time, and with no fixed random_state the resulting
# labels need not correspond to the centres that were just printed.
job_cluster = kmeans.labels_
dataset['Job Cluster'] = job_cluster
#####################################Subchallenge 3######################################################
def getVakanzen(anz_vak):
# Vacancies:
# [Job Title, Industry, Training, Job Percentage, Location, Gender, Age, Languages 1, ..., Languages M, Skill 1, ..., Skill N]
#
# Job name:
# 1: Computer Scientist (Mediamatic) (25%)
# 2: Developer (Software Developer / Developer / Application Engineer / Software Engineer) (50%)
# 3: Project Leader (10%)
# 4: IT supporter (5%)
# 5: System Administrator (5%)
# 6: Database Manager (5%)
#
# Industry:
# 1: technique
# 2: finances
# 3: construction
# 4: health
# 5: Insurance
#
# Training:
# 1: apprenticeship
# 2: Graduation HF
# 3: Graduation FH
# 4: Degree UNI / ETH
# 5: Doctorate
#
# Percentage: 0% => 0/100% => 1
#
# Place: Cantons 1-26
#
# Gender: m => 0 / w => 1
#
# Age: 20 => 0 - 65 => 1
#
# Languages: no knowledge => 0 / mother tongue => 1
# Language 1: de
# Language 2: en
# Language 3: fr
# Language 4: it
#
# Skills: no knowledge => 0 / professional => 1
# Skill 1: Java
# Skill 2: SQL
# Skill 3: .Net
# Skill 4: Python
# Skill 5: Windows
# Skill 6: Linux
# Skill 7:...