As discussed in the chat all necessary data is provided. You need to use the skeleton jupyter notebook and fill in the "gaps" all the rest is in the description of the assignment. They are very picky...

1 answer below »
As discussed in the chat, all necessary data is provided. You need to use the skeleton Jupyter notebook and fill in the "gaps"; everything else is in the description of the assignment. They are very picky about plagiarism and libraries: none are allowed.
Answered Same Day, Mar 11, 2021

Answer To: As discussed in the chat all necessary data is provided. You need to use the skeleton jupyter...

Ximi answered on Mar 13 2021
144 Votes
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Assignment 1"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import nltk\n",
"import math\n",
"import itertools\n",
"import scipy.stats\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.metrics import confusion_matrix\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib as mpl \n",
"
%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Daten einlesen"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"# Read the two corpora from CSV files located in the working directory.\n",
"# The first file is comma-separated, the second one semicolon-separated.\n",
"# NOTE(review): presumably the English and German text collections -- confirm.\n",
"people_en = pd.read_csv('people_wiki_EN.csv', sep=',')\n",
"people_de = pd.read_csv('10k-people-raw.csv', sep=';')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Hilfsfunktionen"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"def box_plot(dist, title = None):\n",
"    \"\"\"Draw one box per letter a-z showing the spread of the given values.\n",
"\n",
"    dist:  data handed unchanged to ``Axes.boxplot`` together with 26 labels,\n",
"           so it presumably has one column per letter -- confirm against caller.\n",
"    title: optional figure title; no title is set when it is None.\n",
"    \"\"\"\n",
"    letters = list(map(chr, range(ord('a'), ord('z') + 1)))\n",
"\n",
"    figure, axes = plt.subplots(figsize = (18, 8))\n",
"    axes.boxplot(dist, labels = letters)\n",
"    axes.set_xlabel('Character')\n",
"    axes.set_ylabel('Frequency')\n",
"    if title is not None:\n",
"        # The axes created above are the current axes, so this matches plt.title.\n",
"        axes.set_title(title)\n",
"    plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"def frequency_plot(mean_en, mean_de):\n",
"    \"\"\"Bar chart of the log-ratio F = ln(mean_en / mean_de) for the letters a-z.\n",
"\n",
"    mean_en, mean_de: per-letter mean values; both are converted with\n",
"        ``np.array`` and divided element-wise, and 26 tick labels are attached,\n",
"        so each presumably holds 26 entries (a..z) -- confirm against caller.\n",
"\n",
"    Bars with F > 0 are drawn green and all other bars blue, matching the two\n",
"    text boxes placed in the figure. The y-axis is fixed to [-2, 2].\n",
"    \"\"\"\n",
"    chars = [chr(c) for c in range(ord('a'), ord('z')+1)]\n",
"    ind = np.arange(len(chars))\n",
"    width = 1\n",
"    frequency = np.log((np.array(mean_en) / np.array(mean_de)))\n",
"    color = ['g' if f > 0 else 'b' for f in frequency]\n",
"\n",
"    fig, ax = plt.subplots(figsize = (18, 8))\n",
"    ax.bar(ind, frequency, width, color = color, edgecolor = 'black')\n",
"    # Fix the tick positions first and attach the labels afterwards; this is\n",
"    # the order matplotlib documents for set_xticklabels. Bars are centred on\n",
"    # ind, so the ticks go exactly there.\n",
"    ax.set_xticks(ind)\n",
"    ax.set_xticklabels(chars)\n",
"    ax.set_xlabel('Character')\n",
"    ax.set_ylim([-2, 2])\n",
"    ax.set_ylabel('F')\n",
"    ax.set_xlim([-0.5, len(chars) - 0.5])\n",
"    ax.text(1, 1.5, 'More frequent in English ', bbox={'facecolor':'green', 'alpha':0.5, 'pad':10})\n",
"    ax.text(1, 1.2, 'More frequent in German', bbox={'facecolor':'blue', 'alpha':0.5, 'pad':10})\n",
"    ax.text(5.5, 1.35, r'$F = \\ln(\\frac{mean_e}{mean_d})$', fontsize=25)\n",
"    plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"def plot_confusion_matrix(cm, classes, title = 'Confusion matrix', cmap = plt.cm.Greens):\n",
"    \"\"\"Show a confusion matrix as a heat map annotated with the values of cm.\n",
"\n",
"    cm:      2-D NumPy array (``astype`` and ``sum(axis=1)`` are called on it);\n",
"             rows are labelled 'True label', columns 'Predicted label'.\n",
"    classes: tick labels used on both axes.\n",
"    title:   figure title.\n",
"    cmap:    colour map for the heat map.\n",
"\n",
"    Cell colours encode the row-normalised share; the printed numbers are the\n",
"    unnormalised entries of cm.\n",
"    \"\"\"\n",
"    positions = np.arange(len(classes))\n",
"\n",
"    plt.title(title)\n",
"    plt.xticks(positions, classes, rotation = 45)\n",
"    plt.yticks(positions, classes)\n",
"\n",
"    row_totals = cm.sum(axis=1)[:, np.newaxis]\n",
"    shares = cm.astype('float') / row_totals\n",
"    # A row whose total is zero yields 0/0 = NaN; display such cells as 0.\n",
"    shares = np.where(np.isnan(shares), 0, shares)\n",
"    shares = np.around(shares, decimals = 3)\n",
"\n",
"    plt.imshow(shares, interpolation='nearest', cmap = cmap)\n",
"\n",
"    # Cells darker than half the maximum get white text so it stays readable.\n",
"    cutoff = shares.max() / 2.\n",
"    n_rows, n_cols = shares.shape[0], shares.shape[1]\n",
"    for row in range(n_rows):\n",
"        for col in range(n_cols):\n",
"            if shares[row, col] > cutoff:\n",
"                text_color = \"white\"\n",
"            else:\n",
"                text_color = \"black\"\n",
"            plt.text(col, row, cm[row, col], horizontalalignment=\"center\", color=text_color)\n",
"\n",
"    plt.tight_layout()\n",
"    plt.ylabel('True label')\n",
"    plt.xlabel('Predicted label')\n",
"    plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Häufigkeitsverteilungen"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"def character_distribution(people):\n",
"    \"\"\"Compute relative letter frequencies (a-z) for a collection of texts.\n",
"\n",
"    people: pandas Series (or any sized iterable) holding one text per entry.\n",
"            Entries that are not strings (e.g. NaN) are treated as empty texts.\n",
"\n",
"    Returns (mean, std, dist):\n",
"        mean: shape (26,), average relative frequency of each letter over all texts.\n",
"        std:  shape (26,), standard deviation of each letter's relative\n",
"              frequency over all texts.\n",
"        dist: shape (len(people), 26), relative frequency per text and letter.\n",
"              Each row sums to 1, or is all zeros when the text has no a-z letter.\n",
"\n",
"    Letters are counted case-insensitively; every character outside a-z\n",
"    (digits, punctuation, umlauts, ...) is ignored.\n",
"    \"\"\"\n",
"    alphabet = [chr(c) for c in range(ord('a'), ord('z') + 1)]\n",
"\n",
"    counts = np.zeros(shape = (len(people), 26))\n",
"    for row, text in enumerate(people):\n",
"        if not isinstance(text, str):\n",
"            continue\n",
"        lowered = text.lower()\n",
"        # str.count runs in C, so 26 passes beat a Python loop over every character.\n",
"        counts[row] = [lowered.count(letter) for letter in alphabet]\n",
"\n",
"    # Normalise each text by its own number of letters; texts without any\n",
"    # letter keep an all-zero row instead of producing 0/0 = NaN.\n",
"    totals = counts.sum(axis=1, keepdims=True)\n",
"    dist = np.divide(counts, totals, out=np.zeros_like(counts), where=totals > 0)\n",
"\n",
"    if len(dist) == 0:\n",
"        # No texts at all: avoid the mean-of-empty-slice warning and NaN results.\n",
"        return np.zeros(shape = 26), np.zeros(shape = 26), dist\n",
"\n",
"    mean = dist.mean(axis=0)\n",
"    std = dist.std(axis=0)\n",
"    return mean, std, dist"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"data": {
"image/png":...
SOLUTION.PDF

Answer To This Question Is Available To Download

Related Questions & Answers

More Questions »

Submit New Assignment

Copy and Paste Your Assignment Here