Answer To: Objective: Investigate and assess Data Engineering approaches to integrate handwriting recognition...
Sandeep Kumar answered on Mar 01 2021
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### GPU benchmark on MNIST"
]
},
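{
"cell_type": "markdown",
"metadata": {},
"source": [
"Before timing anything it is worth confirming that TensorFlow can actually see a GPU, otherwise the benchmark silently measures the CPU. A minimal sketch, assuming the TF 1.x / standalone-Keras stack imported below (`tf.test.is_gpu_available()` is deprecated in TF 2.x in favour of `tf.config.list_physical_devices('GPU')`):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import tensorflow as tf\n",
"\n",
"# Sketch: report whether a CUDA-capable GPU is visible to this TensorFlow build.\n",
"if tf.test.is_gpu_available():\n",
"    print('GPU found - the benchmark below will run on the GPU')\n",
"else:\n",
"    print('No GPU found - the benchmark below will fall back to the CPU')"
]
},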
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Using TensorFlow backend.\n"
]
}
],
"source": [
"import tensorflow as tf\n",
"import numpy as np \n",
"import matplotlib.pyplot as plt\n",
"import keras as k\n",
"from tensorflow.examples.tutorials.mnist import input_data\n",
"from keras.datasets import mnist\n",
"from keras.models import Sequential\n",
"from keras.layers import Dense, Dropout, Flatten\n",
"from keras.layers import Conv2D, MaxPooling2D, BatchNormalization\n",
"from keras.optimizers import SGD, Adam\n",
"from keras.models import load_model\n",
"from keras import backend as K\n",
"import requests\n",
"import os\n",
"from random import randint\n",
"\n",
"try:\n",
" from tqdm import tqdm\n",
"except ImportError:\n",
" tqdm = lambda x, total, unit: x # If tqdm doesn't exist, replace it with a function that does nothing\n",
" print('**** Could not import tqdm. Please install tqdm for download progressbars! (pip install tqdm) ****')\n",
"\n",
"# Python2 compatibility\n",
"try:\n",
" input = raw_input\n",
"except NameError:\n",
" pass\n",
"\n",
"download_dict = {\n",
" '1) Kuzushiji-MNIST (10 classes, 28x28, 70k examples)': {\n",
" '1) MNIST data format (ubyte.gz)':\n",
" ['http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-images-idx3-ubyte.gz',\n",
" 'http://codh.rois.ac.jp/kmnist/dataset/kmnist/train-labels-idx1-ubyte.gz',\n",
" 'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-images-idx3-ubyte.gz',\n",
" 'http://codh.rois.ac.jp/kmnist/dataset/kmnist/t10k-labels-idx1-ubyte.gz'],\n",
" '2) NumPy data format (.npz)':\n",
" ['http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-train-imgs.npz',\n",
" 'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-train-labels.npz',\n",
" 'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-test-imgs.npz',\n",
" 'http://codh.rois.ac.jp/kmnist/dataset/kmnist/kmnist-test-labels.npz'],\n",
" },\n",
" '2) Kuzushiji-49 (49 classes, 28x28, 270k examples)': {\n",
" '1) NumPy data format (.npz)':\n",
" ['http://codh.rois.ac.jp/kmnist/dataset/k49/k49-train-imgs.npz',\n",
" 'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-train-labels.npz',\n",
" 'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-test-imgs.npz',\n",
" 'http://codh.rois.ac.jp/kmnist/dataset/k49/k49-test-labels.npz'],\n",
" },\n",
" '3) Kuzushiji-Kanji (3832 classes, 64x64, 140k examples)': {\n",
" '1) Folders of images (.tar)':\n",
" ['http://codh.rois.ac.jp/kmnist/dataset/kkanji/kkanji.tar'],\n",
" }\n",
"\n",
"}\n",
"\n",
"# Download a list of files\n",
"def download_list(url_list):\n",
" for url in url_list:\n",
" path = url.split('/')[-1]\n",
" r = requests.get(url, stream=True)\n",
" with open(path, 'wb') as f:\n",
" total_length = int(r.headers.get('content-length'))\n",
" print('Downloading {} - {:.1f} MB'.format(path, (total_length / 1024000)))\n",
"\n",
" for chunk in tqdm(r.iter_content(chunk_size=1024), total=int(total_length / 1024) + 1, unit=\"KB\"):\n",
" if chunk:\n",
" f.write(chunk)\n",
" print('All dataset files downloaded!')\n",
"\n",
"# Ask the user about which path to take down the dict\n",
"def traverse_dict(d):\n",
" print('Please select a download option:')\n",
" keys = sorted(d.keys()) # Print download options\n",
" for key in keys:\n",
" print(key)\n",
"\n",
" userinput = input('> ').strip()\n",
"\n",
" try:\n",
" selection = int(userinput) - 1\n",
" except ValueError:\n",
" print('Your selection was not valid')\n",
" traverse_dict(d) # Try again if input was not valid\n",
" return\n",
"\n",
" selected = keys[selection]\n",
"\n",
" next_level = d[selected]\n",
" if isinstance(next_level, list): # If we've hit a list of downloads, download that list\n",
" download_list(next_level)\n",
" else:\n",
" traverse_dict(next_level) # Otherwise, repeat with the next level\n",
"\n",
"traverse_dict(download_dict)\n",
"\n"
]
},
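{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once downloaded, the NumPy-format archives can be loaded directly. A minimal sketch, assuming the `.npz` files from the NumPy data format option above sit in the working directory and that each archive stores its array under NumPy's default `arr_0` key (as the official Kuzushiji-MNIST releases do):"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"\n",
"# Sketch: load the Kuzushiji-MNIST arrays fetched by download_list() above.\n",
"kmnist_x_train = np.load('kmnist-train-imgs.npz')['arr_0']   # (60000, 28, 28) uint8 images\n",
"kmnist_y_train = np.load('kmnist-train-labels.npz')['arr_0'] # (60000,) class ids 0-9\n",
"kmnist_x_test = np.load('kmnist-test-imgs.npz')['arr_0']\n",
"kmnist_y_test = np.load('kmnist-test-labels.npz')['arr_0']\n",
"print(kmnist_x_train.shape, kmnist_y_train.shape)"
]
},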
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"APP_NAME = '%s-%d' % ('fashion-mnist', randint(0, 100))\n",
"LOG_FORMAT = '%(asctime)-15s %(filename)s:%(funcName)s:[%(levelname)s] %(message)s'\n",
"JSON_FORMAT = '%(message)s'\n",
"\n",
"RUN_LOCALLY = False\n",
"ROOT_DIR = os.path.dirname(os.path.abspath(__file__)) + '/'\n",
"TEST_DIR = ROOT_DIR + 'test/'\n",
"DATA_DIR = ROOT_DIR + 'data/fashion'\n",
"VIS_DIR = ROOT_DIR + 'visualization/'\n",
"MODEL_SAVE_DIR = ROOT_DIR + 'save/'\n",
"MULTI_TASK_MODEL = '20170814-153653'\n",
"TEST_DATA_DIR = TEST_DIR + 'data/'\n",
"LOG_DIR = ROOT_DIR + 'log/'\n",
"RESULT_DIR = ROOT_DIR + 'result/'\n",
"TEMPLATE_DIR = ROOT_DIR + 'templates/'\n",
"STATIC_DIR = ROOT_DIR + 'static/'\n",
"SCRIPT_DIR = ROOT_DIR + 'script/'\n",
"BASELINE_PATH = ROOT_DIR + 'benchmark/baselines.json'\n",
"\n",
"Q2A_SUFFIX = '-merged-ad1-20170501+36D+20170605.json.gz'\n",
"\n",
"SYNC_SCRIPT_PATH = SCRIPT_DIR + 'sync_s3.sh'\n",
"DOWNLOAD_SCRIPT_PATH = SCRIPT_DIR + 'load_s3_json.sh'\n",
"LOG_PATH = LOG_DIR + APP_NAME + '.log'\n",
"RESULT_PATH = RESULT_DIR + APP_NAME + '.json'\n",
"\n",
"Q2A_PATH = DATA_DIR + \"query2brand-train.tfr\"\n",
"Q2A_INFO = DATA_DIR + \"query2brand.json\"\n",
"MAX_ITEM_PER_ATTRIBUTE = 20\n",
"\n",
"LOSS_JITTER = 1e-4\n",
"SYNC_INTERVAL = 300.0 # sync every 5 minutes\n",
"SYNC_TIMEOUT = 600\n",
"FIRST_SYNC_DELAY = 300.0 # do the first task only after 5 minutes.\n",
"\n",
"RNN_ARGS_JSON = ROOT_DIR + 'nn/queryclf/config.json'\n",
"\n",
"Q2A_JSON_AKEY1 = 'attributes'\n",
"Q2A_JSON_AKEY2 = 'value'\n",
"\n",
"\n",
"def touch(fname: str, times=None, create_dirs: bool = False):\n",
" if create_dirs:\n",
" base_dir = os.path.dirname(fname)\n",
" if not os.path.exists(base_dir):\n",
" os.makedirs(base_dir)\n",
" with open(fname, 'a'):\n",
" os.utime(fname, times)\n",
"\n",
"\n",
"def touch_dir(base_dir: str) -> None:\n",
" if not os.path.exists(base_dir):\n",
" os.makedirs(base_dir)\n",
"\n",
"\n",
"def _get_logger(name: str):\n",
" import logging.handlers\n",
" touch(LOG_PATH, create_dirs=True)\n",
" touch_dir(MODEL_SAVE_DIR)\n",
" l = logging.getLogger(name)\n",
" l.setLevel(logging.DEBUG)\n",
" fh = logging.FileHandler(LOG_PATH)\n",
" fh.setLevel(logging.INFO)\n",
" ch = logging.StreamHandler()\n",
" ch.setLevel(logging.INFO)\n",
" fh.setFormatter(logging.Formatter(LOG_FORMAT))\n",
" ch.setFormatter(logging.Formatter(LOG_FORMAT))\n",
" l.addHandler(fh)\n",
" l.addHandler(ch)\n",
" return l\n",
"\n",
"\n",
"def get_json_logger(name: str):\n",
" import logging.handlers\n",
" touch(RESULT_PATH, create_dirs=True)\n",
" l = logging.getLogger(__name__ + name)\n",
" l.setLevel(logging.INFO)\n",
" # add rotator to the logger. it's lazy in the sense that it wont rotate unless there are new logs\n",
" fh = logging.FileHandler(RESULT_PATH)\n",
" fh.setLevel(logging.INFO)\n",
" fh.setFormatter(logging.Formatter(JSON_FORMAT))\n",
" l.addHandler(fh)\n",
" return l\n",
"\n",
"\n",
"LOGGER = _get_logger(__name__)\n",
"JSON_LOGGER = get_json_logger('json' + __name__)"
]
},
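{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick smoke test of the two loggers defined above: `LOGGER` writes timestamped lines to the console and to `LOG_PATH`, while `JSON_LOGGER` appends bare messages to `RESULT_PATH`, which allows benchmark results to be emitted as one JSON object per line. The payload fields below are illustrative only, not part of the original configuration:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import json\n",
"\n",
"# Sketch: exercise both loggers once; the result fields are made up for illustration.\n",
"LOGGER.info('smoke test: app %s starting', APP_NAME)\n",
"JSON_LOGGER.info(json.dumps({'app': APP_NAME, 'epoch': 0, 'test_accuracy': None}))"
]
},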
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"\n",
"# data preprocessing\n",
"(x_train, y_train), (x_test, y_test) = mnist.load_data()\n",
"img_rows, img_cols = 28,28\n",
"x_train = x_train.reshape(x_train.shape[0], img_rows, img_cols, 1)\n",
"x_test = x_test.reshape(x_test.shape[0], img_rows, img_cols,...