Answer To: As discussed in the chat all necessary data is provided. You need to use the skeleton jupyter...
Ximi answered on Mar 13 2021
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Assignment 1"
]
},
{
"cell_type": "code",
"execution_count": 80,
"metadata": {},
"outputs": [],
"source": [
"import sys\n",
"import nltk\n",
"import math\n",
"import itertools\n",
"import scipy.stats\n",
"import numpy as np\n",
"import pandas as pd\n",
"from sklearn.metrics import confusion_matrix\n",
"import matplotlib.pyplot as plt\n",
"import matplotlib as mpl \n",
"%matplotlib inline"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Daten einlesen"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [],
"source": [
"people_en = pd.read_csv('people_wiki_EN.csv', sep=',')\n",
"people_de = pd.read_csv('10k-people-raw.csv', sep=';')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Hilfsfunktionen"
]
},
{
"cell_type": "code",
"execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
"def box_plot(dist, title = None):\n",
" chars = [chr(c) for c in range(ord('a'), ord('z') + 1)]\n",
" \n",
" fig1, ax1 = plt.subplots(figsize = (18, 8))\n",
" ax1.set_xlabel('Character')\n",
" ax1.set_ylabel('Frequency')\n",
" ax1.boxplot(dist, labels = chars)\n",
" #ax1.set_ylim([0, 0.3])\n",
" if title is not None:\n",
" plt.title(title)\n",
" plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 83,
"metadata": {},
"outputs": [],
"source": [
"def frequency_plot(mean_en, mean_de):\n",
" chars = [chr(c) for c in range(ord('a'), ord('z')+1)]\n",
" ind = np.arange(len(chars))\n",
" width = 1\n",
" frequency = np.log((np.array(mean_en) / np.array(mean_de)))\n",
" color = ['g' if f > 0 else 'b' for f in frequency]\n",
" \n",
" fig, ax = plt.subplots(figsize = (18, 8))\n",
" rects1 = ax.bar(ind, frequency, width, color = color, edgecolor = 'black')\n",
" ax.set_xticklabels(chars)\n",
" ax.set_xticks(ind + (width * 0.5) - 0.5)\n",
" ax.set_xlabel('Character')\n",
" ax.set_ylim([-2, 2])\n",
" ax.set_ylabel('F')\n",
" ax.set_xlim([-0.5, len(chars) - 0.5])\n",
" ax.text(1, 1.5, 'More frequent in English ', bbox={'facecolor':'green', 'alpha':0.5, 'pad':10})\n",
" ax.text(1, 1.2, 'More frequent in German', bbox={'facecolor':'blue', 'alpha':0.5, 'pad':10})\n",
" ax.text(5.5, 1.35, r'$F = \\ln(\\frac{mean_e}{mean_d})$', fontsize=25)\n",
" plt.show()"
]
},
{
"cell_type": "code",
"execution_count": 84,
"metadata": {},
"outputs": [],
"source": [
"def plot_confusion_matrix(cm, classes, title = 'Confusion matrix', cmap = plt.cm.Greens):\n",
" plt.title(title)\n",
" tick_marks = np.arange(len(classes))\n",
" plt.xticks(tick_marks, classes, rotation = 45)\n",
" plt.yticks(tick_marks, classes)\n",
"\n",
" cm_norm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n",
" cm_norm[np.isnan(cm_norm)] = 0\n",
" cm_norm = np.around(cm_norm, decimals = 3)\n",
" \n",
" plt.imshow(cm_norm, interpolation='nearest', cmap = cmap)\n",
" \n",
" thresh = cm_norm.max() / 2.\n",
" for i, j in itertools.product(range(cm_norm.shape[0]), range(cm_norm.shape[1])):\n",
" plt.text(j, i, cm[i, j], horizontalalignment=\"center\", color=\"white\" if cm_norm[i, j] > thresh else \"black\")\n",
"\n",
" plt.tight_layout()\n",
" plt.ylabel('True label')\n",
" plt.xlabel('Predicted label')\n",
" plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Häufigkeitsverteilungen"
]
},
{
"cell_type": "code",
"execution_count": 85,
"metadata": {},
"outputs": [],
"source": [
"def character_distribution(people):\n",
" # Implementieren Sie eine Funktion die die Buchstabenhäufigkeiten berechnet.\n",
" # Die Funktion soll folgendes zurückgeben:\n",
" # mean: Durchschnittlicher relative Häufigkeit pro Buchstabe \n",
" # std: Relative Standardabweichung pro Buchstabe\n",
" # dist: Relative Häufigkeit pro Text und pro Buchstabe \n",
" mean = np.zeros(shape = 26)\n",
" std = np.zeros(shape = 26)\n",
" dist = np.zeros(shape = (len(people), 26))\n",
" \n",
" def get_chars():\n",
" return {chr(c):0 for c in range(ord('a'), ord('z')+1)}\n",
" def get_freq(text):\n",
" chars = get_chars()\n",
" for i in text: \n",
" if i in chars: \n",
" chars[i] += 1\n",
" return list(chars.values())\n",
" \n",
" freq = people.apply(get_freq)\n",
" mean = np.mean(list(freq), axis=0)\n",
" std = np.std(list(freq), axis=0)\n",
" dist = freq.apply(lambda x: mean/x)\n",
" dist = np.array(list(dist))\n",
" \n",
" return mean, std, dist"
]
},
{
"cell_type": "code",
"execution_count": 86,
"metadata": {},
"outputs": [
{
"data": {
"image/png":...