{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# These pandas tips will save you hours of head scratching" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "To Step Up Your Pandas Game, read:\n", "- [5 lesser-known pandas tricks](https://towardsdatascience.com/5-lesser-known-pandas-tricks-e8ab1dd21431)\n", "- [Exploratory Data Analysis with pandas](https://towardsdatascience.com/exploratory-data-analysis-with-pandas-508a5e8a5964)\n", "- [How NOT to write pandas code](https://towardsdatascience.com/how-not-to-write-pandas-code-ef88599c6e8f)\n", "- [These pandas tips will save you hours of headscratching](https://)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "System\n", "os name: posix\n", "system: Darwin\n", "release: 18.7.0\n", "\n", "Python\n", "version: 3.7.3\n", "\n", "Python Packages\n", "jupterlab==1.1.5\n", "pandas==0.25.3\n", "numpy==1.17.4\n" ] } ], "source": [ "import os\n", "import platform\n", "from platform import python_version\n", "\n", "import jupyterlab\n", "import numpy as np\n", "import pandas as pd\n", "\n", "print(\"System\")\n", "print(\"os name: %s\" % os.name)\n", "print(\"system: %s\" % platform.system())\n", "print(\"release: %s\" % platform.release())\n", "print()\n", "print(\"Python\")\n", "print(\"version: %s\" % python_version())\n", "print()\n", "print(\"Python Packages\")\n", "print(\"jupterlab==%s\" % jupyterlab.__version__)\n", "print(\"pandas==%s\" % pd.__version__)\n", "print(\"numpy==%s\" % np.__version__)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.37454012, 0.95071431, 0.73199394, 0.59865848, 0.15601864,\n", " 0.15599452, 0.05808361, 0.86617615, 0.60111501, 0.70807258])" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.random.random_sample(10)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.02058449, 0.96990985, 0.83244264, 0.21233911, 0.18182497,\n", " 0.18340451, 0.30424224, 0.52475643, 0.43194502, 0.29122914])" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.random.random_sample(10)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "What happens when we set the same seed again?\n", "We reset the seed and we get the same sequence of numbers as above.\n", "This makes deterministic pseudorandom number generator." ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([0.37454012, 0.95071431, 0.73199394, 0.59865848, 0.15601864,\n", " 0.15599452, 0.05808361, 0.86617615, 0.60111501, 0.70807258])" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "np.random.random_sample(10)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
col1
1a
2b
3c
4d
4d
\n", "
" ], "text/plain": [ " col1\n", "1 a\n", "2 b\n", "3 c\n", "4 d\n", "4 d" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.DataFrame(index=[1, 2, 3, 4, 4], data={\"col1\": [\"a\", \"b\", \"c\", \"d\", \"d\"]})\n", "df" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "ename": "AssertionError", "evalue": "Dataframe has duplicates", "output_type": "error", "traceback": [ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)", "\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mduplicated\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Dataframe has duplicates\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m", "\u001b[0;31mAssertionError\u001b[0m: Dataframe has duplicates" ] } ], "source": [ "assert len(df[df.index.duplicated()]) == 0, \"Dataframe has duplicates\"" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(5, 1)" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.shape" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(7, 2)" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_new = df.join(df, lsuffix='_l', rsuffix='_r')\n", "df_new.shape" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
col1_lcol1_r
1aa
2bb
3cc
4dd
4dd
4dd
4dd
\n", "
" ], "text/plain": [ " col1_l col1_r\n", "1 a a\n", "2 b b\n", "3 c c\n", "4 d d\n", "4 d d\n", "4 d d\n", "4 d d" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_new" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
col1
1a
2b
3c
4d
4d
\n", "
" ], "text/plain": [ " col1\n", "1 a\n", "2 b\n", "3 c\n", "4 d\n", "4 d" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
col1
1a
2b
3c
4d
4d
\n", "
" ], "text/plain": [ " col1\n", "1 a\n", "2 b\n", "3 c\n", "4 d\n", "4 d" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_clipboard(sep='\\s\\s+')\n", "df" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }