{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# Regression\n", "In this example we are building a model that predicts house prices in Boston.\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/dan0nchik/SAP-HANA-AutoML/blob/dev/docs/source/regression.ipynb)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Install modules" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "# !pip3 install Cython\n", "# !pip3 install hana_automl" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "# sys is imported outside the try block so the ImportError handler below can always call sys.exit.\n", "import sys\n", "\n", "try:\n", "    from hana_automl.automl import AutoML\n", "    import pandas as pd\n", "    from hana_ml.dataframe import ConnectionContext\n", "    from hana_automl.storage import Storage\n", "except ImportError:\n", "    # Abort with an install hint when one of the imports above is missing.\n", "    sys.exit(\"\"\"You need to install hana_automl and pandas. Uncomment cell above\"\"\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Let's get familiar with the dataset" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDcrimzninduschasnoxrmagedisradtaxptratioblacklstatmedv
000.158760.010.810.00.4135.96117.55.28734.0305.019.2376.949.8821.7
110.1032825.05.130.00.4535.92747.26.93208.0284.019.7396.909.2219.6
220.349400.09.900.00.5445.97276.73.10254.0304.018.4396.249.9720.3
332.733970.019.580.00.8715.59794.91.52575.0403.014.7351.8521.4515.4
440.0433721.05.640.00.4396.11563.06.81474.0243.016.8393.979.4320.5
\n", "
" ], "text/plain": [ " ID crim zn indus chas nox rm age dis rad tax \\\n", "0 0 0.15876 0.0 10.81 0.0 0.413 5.961 17.5 5.2873 4.0 305.0 \n", "1 1 0.10328 25.0 5.13 0.0 0.453 5.927 47.2 6.9320 8.0 284.0 \n", "2 2 0.34940 0.0 9.90 0.0 0.544 5.972 76.7 3.1025 4.0 304.0 \n", "3 3 2.73397 0.0 19.58 0.0 0.871 5.597 94.9 1.5257 5.0 403.0 \n", "4 4 0.04337 21.0 5.64 0.0 0.439 6.115 63.0 6.8147 4.0 243.0 \n", "\n", " ptratio black lstat medv \n", "0 19.2 376.94 9.88 21.7 \n", "1 19.7 396.90 9.22 19.6 \n", "2 18.4 396.24 9.97 20.3 \n", "3 14.7 351.85 21.45 15.4 \n", "4 16.8 393.97 9.43 20.5 " ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "test_df = pd.read_csv('https://raw.githubusercontent.com/dan0nchik/SAP-HANA-AutoML/dev/docs/source/datasets/boston_test_data.csv')\n", "df = pd.read_csv('https://raw.githubusercontent.com/dan0nchik/SAP-HANA-AutoML/dev/docs/source/datasets/boston_data.csv')\n", "df.head()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Pass credentials to the database." ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "# Replace with your credentials\n", "cc = ConnectionContext(address='address', \n", " port=39015, # default for most databases. 
\n", " # Details here: https://help.sap.com/viewer/0eec0d68141541d1b07893a39944924e/2.0.03/en-US/b250e7fef8614ea0a0973d58eb73bda8.html\n", " user='user',\n", " password='password')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "automl = AutoML(connection_context=cc)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "automl.fit(\n", " df=df,\n", " task=None, # library will try to determine task\n", " steps=10,\n", " target='medv',\n", " table_name='REGRESSION', # optional\n", " id_column='ID', # pass None if no ID column in dataset\n", " verbose=1\n", ")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Save model" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "storage = Storage(connection_context=cc, schema='DEVELOPER')\n", "automl.model.name = \"boston\" # don't forget to specify the name\n", "storage.save_model(automl=automl)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NAMEVERSIONLIBRARYCLASSJSONTIMESTAMPMODEL_STORAGE_VER
0boston1PALhana_ml.algorithms.pal.trees.HybridGradientBoo...{\"model_attributes\": {\"n_estimators\": 541, \"ra...2021-05-29 17:19:091
\n", "
" ], "text/plain": [ " NAME VERSION LIBRARY CLASS \\\n", "0 boston 1 PAL hana_ml.algorithms.pal.trees.HybridGradientBoo... \n", "\n", " JSON TIMESTAMP \\\n", "0 {\"model_attributes\": {\"n_estimators\": 541, \"ra... 2021-05-29 17:19:09 \n", "\n", " MODEL_STORAGE_VER \n", "0 1 " ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "storage.list_models()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Load model and predict" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Creating table with name: AUTOML6b526c36-6c5e-459b-b5ec-92cf44f78b15\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 1/1 [00:00<00:00, 6.94it/s]\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Preprocessor settings: median\n", "Prediction results (first 20 rows): \n", " ID SCORE CONFIDENCE\n", "0 1 35.29534448792848 None\n", "1 2 22.489733948983936 None\n", "2 3 13.713093051897628 None\n", "3 4 24.172307331613972 None\n", "4 5 20.248966896747383 None\n", "5 6 23.39085017433158 None\n", "6 7 20.549891594531058 None\n", "7 8 16.66822914049735 None\n", "8 9 18.383664287056316 None\n", "9 10 46.003539073294455 None\n", "10 11 40.11289274119828 None\n", "11 12 11.69618986227771 None\n", "12 13 12.88755355977079 None\n", "13 14 35.486779348478116 None\n", "14 15 20.358130558843374 None\n", "15 16 9.478611548228256 None\n", "16 17 20.84342879503455 None\n", "17 18 20.269377197301793 None\n", "18 19 20.410181244909623 None\n", "19 20 10.344656992114738 None\n" ] }, { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
IDSCORECONFIDENCE
0135.29534448792848None
1222.489733948983936None
2313.713093051897628None
3424.172307331613972None
4520.248966896747383None
............
979823.80490001240718None
989922.24186449851454None
9910018.38892297314412None
10010138.51662147490824None
10110246.306857840422325None
\n", "

102 rows × 3 columns

\n", "
" ], "text/plain": [ " ID SCORE CONFIDENCE\n", "0 1 35.29534448792848 None\n", "1 2 22.489733948983936 None\n", "2 3 13.713093051897628 None\n", "3 4 24.172307331613972 None\n", "4 5 20.248966896747383 None\n", ".. ... ... ...\n", "97 98 23.80490001240718 None\n", "98 99 22.24186449851454 None\n", "99 100 18.38892297314412 None\n", "100 101 38.51662147490824 None\n", "101 102 46.306857840422325 None\n", "\n", "[102 rows x 3 columns]" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "new_model = storage.load_model('boston', version=1)\n", "new_model.predict(df=test_df)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Cleanup storage" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "storage.clean_up()" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "For more information, visit AutoML class and Storage class in documentation" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.5" } }, "nbformat": 4, "nbformat_minor": 2 }