{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "94169c2a",
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'\n",
    "import numpy as np\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "import tensorflow as tf\n",
    "tf.compat.v1.disable_eager_execution()\n",
    "import tensorflow_probability as tfp\n",
    "import matplotlib.pyplot as plt\n",
    "from data_loader import load_data\n",
    "from data_preprocesser import preprocess_data\n",
    "from planar import Planar"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "087e74b6",
   "metadata": {},
   "outputs": [],
   "source": [
    "def train(session, loss, optimizer, steps=int(1e5)):\n",
    "    \"\"\"Run the training op for `steps` iterations and sample the loss.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    session : object exposing `run(fetches)`\n",
    "        Session used to evaluate `optimizer` and `loss` together.\n",
    "    loss : fetchable\n",
    "        Loss value fetched on every iteration.\n",
    "    optimizer : fetchable\n",
    "        Training op fetched on every iteration.\n",
    "    steps : int\n",
    "        Number of iterations to run.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    list\n",
    "        The loss fetched at every 100th iteration (0, 100, 200, ...).\n",
    "    \"\"\"\n",
    "    recorded_losses = []\n",
    "    for i in range(steps):\n",
    "        _, loss_per_iteration = session.run([optimizer, loss])\n",
    "        # keep one sample per 100 iterations for plotting\n",
    "        if i % 100 == 0:\n",
    "            recorded_losses.append(loss_per_iteration)\n",
    "        # progress report every 10,000 iterations\n",
    "        if i % int(1e4) == 0:\n",
    "            print('Iteration {iteration}: {loss}'.format(iteration=i, loss=loss_per_iteration))\n",
    "    return recorded_losses\n",
    "\n",
    "\n",
    "def plot_results(recorded_losses):\n",
    "    \"\"\"Scatter the recorded losses and overlay a straight-line fit.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    recorded_losses : sequence of float\n",
    "        Loss samples in recording order; the x axis is the sample index.\n",
    "    \"\"\"\n",
    "    print('Displaying results...')\n",
    "    fig, ax = plt.subplots(figsize=(10, 5))\n",
    "    x = np.arange(len(recorded_losses))\n",
    "    y = np.asarray(recorded_losses, dtype=float)\n",
    "    ax.scatter(x, y, s=10, alpha=0.3)\n",
    "    # a degree-1 fit needs at least two points; skip the trend line otherwise\n",
    "    if len(x) >= 2:\n",
    "        m, b = np.polyfit(x, y, 1)\n",
    "        ax.plot(x, m * x + b, c=\"r\")\n",
    "    ax.set_title('Loss per 100 iteration')\n",
    "    ax.set_xlabel('Iteration')\n",
    "    ax.set_ylabel('Loss')\n",
    "    fig.tight_layout()\n",
    "    plt.show()\n",
    "\n",
    "\n",
    "def create_tensor(data, batch_size):\n",
    "    \"\"\"Build an endless stream of shuffled float32 mini-batches from `data`.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    data : array-like exposing `astype` and `shape`\n",
    "        Samples along the first axis.\n",
    "    batch_size : int\n",
    "        Number of samples per batch.\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    The `get_next()` tensor of a one-shot iterator over the dataset.\n",
    "    \"\"\"\n",
    "    dataset = tf.data.Dataset.from_tensor_slices(data.astype(np.float32))\n",
    "    # shuffle before repeat so every pass over the data visits each row once\n",
    "    dataset = dataset.shuffle(buffer_size=data.shape[0])\n",
    "    dataset = dataset.repeat()\n",
    "    dataset = dataset.batch(batch_size)\n",
    "    # prefetch last so that ready-made batches (not single rows) are\n",
    "    # prepared while the previous training step is still running\n",
    "    dataset = dataset.prefetch(2)\n",
    "    data_iterator = tf.compat.v1.data.make_one_shot_iterator(dataset)\n",
    "    samples = data_iterator.get_next()\n",
    "    return samples\n",
    "\n",
    "\n",
    "# NOTE: if TensorFlow raises an error complaining about tf.float32,\n",
    "# try the following (one of them is probably enough):\n",
    "#   * downgrade keras to 2.3.1\n",
    "#   * replace tf.float32 with np.float32\n",
    "def check_version():\n",
    "    \"\"\"Print the TensorFlow, TensorFlow Probability and Keras versions.\"\"\"\n",
    "    print(f'Tensorflow version: {tf.__version__}')\n",
    "    print(f'Tensorflow-probability version: {tfp.__version__}')\n",
    "    print(f'Keras version: {tf.keras.__version__}\\n')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "23f55548",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Data successfully loaded...\n",
      "\n",
      "Data successfully preprocessed...\n",
      "\n",
      "Optimizer and loss successfully defined...\n",
      "\n",
      "Iteration 0: 95.97050476074219\n",
      "Iteration 10000: 57.572265625\n",
      "Iteration 20000: 49.272705078125\n",
      "Iteration 30000: 46.765769958496094\n",
      "Iteration 40000: 46.634979248046875\n",
      "Iteration 50000: 44.979713439941406\n"
     ]
    }
   ],
   "source": [
    "def main(filename='prostate.xls', directory='/Users/kaanguney.keklikci/Data/', seed=42):\n",
    "    \"\"\"Fit a chain of Planar flows to the preprocessed data and plot the loss.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    filename : str\n",
    "        File name handed to the loader.\n",
    "    directory : str\n",
    "        Directory handed to the loader. The default is a machine-specific\n",
    "        absolute path; pass your own location when running elsewhere.\n",
    "    seed : int\n",
    "        Graph-level TensorFlow random seed.\n",
    "    \"\"\"\n",
    "    # --- load data ---\n",
    "    loader = load_data(filename, directory)\n",
    "    loader.create_directory(directory)\n",
    "    data = loader.read_data(directory, filename)\n",
    "    print('Data successfully loaded...\\n')\n",
    "\n",
    "    # --- preprocess data ---\n",
    "    fillna_vals = ['sz', 'sg', 'wt']\n",
    "    dropna_vals = ['ekg', 'age']\n",
    "    drop_vals = ['patno', 'sdate']\n",
    "\n",
    "    preprocesser = preprocess_data(StandardScaler(), fillna_vals, dropna_vals, drop_vals)\n",
    "    data = preprocesser.dropna_features(data)\n",
    "    data = preprocesser.impute(data)\n",
    "    data = preprocesser.drop_features(data)\n",
    "    data = preprocesser.encode_categorical(data)\n",
    "    data = preprocesser.scale(data)\n",
    "    print('Data successfully preprocessed...\\n')\n",
    "\n",
    "    # --- set Planar parameters ---\n",
    "    tfd = tfp.distributions\n",
    "    tfb = tfp.bijectors\n",
    "\n",
    "    batch_size = 32\n",
    "    dtype = np.float32\n",
    "    layers = 8\n",
    "    dims = data.shape[1]\n",
    "    learning_rate = 1e-4\n",
    "\n",
    "    # Build everything in a fresh graph so that re-running this cell does\n",
    "    # not keep adding ops and variables to the global default graph.\n",
    "    with tf.Graph().as_default():\n",
    "        # The graph-level seed only influences ops created after it is set,\n",
    "        # so it must come before the input pipeline, the flow and the\n",
    "        # optimizer are constructed.\n",
    "        tf.compat.v1.set_random_seed(seed)\n",
    "\n",
    "        # multivariate normal for base distribution\n",
    "        base_dist = tfd.MultivariateNormalDiag(loc=tf.zeros(shape=dims, dtype=dtype))\n",
    "\n",
    "        # --- initialize samples ---\n",
    "        samples = create_tensor(data, batch_size)\n",
    "\n",
    "        # --- make Planar ---\n",
    "        bijectors = [\n",
    "            Planar(input_dimensions=dims, case='density_estimation')\n",
    "            for _ in range(layers)\n",
    "        ]\n",
    "        bijector = tfb.Chain(bijectors=list(reversed(bijectors)), name='chain_of_planar')\n",
    "        planar_flow = tfd.TransformedDistribution(\n",
    "            distribution=base_dist,\n",
    "            bijector=bijector\n",
    "        )\n",
    "\n",
    "        # negative mean log-likelihood of a batch under the flow\n",
    "        loss = -tf.reduce_mean(planar_flow.log_prob(samples))\n",
    "        optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate).minimize(loss)\n",
    "\n",
    "        # the context manager closes the session even if training raises\n",
    "        with tf.compat.v1.Session() as session:\n",
    "            session.run(tf.compat.v1.global_variables_initializer())\n",
    "            print('Optimizer and loss successfully defined...\\n')\n",
    "\n",
    "            # --- start training ---\n",
    "            recorded_losses = train(session, loss, optimizer)\n",
    "    print('Training finished...\\n')\n",
    "\n",
    "    # --- display results ---\n",
    "    plot_results(recorded_losses)\n",
    "\n",
    "\n",
    "if __name__ == \"__main__\":\n",
    "    main()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "0d33af87",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}