{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# How NOT to write pandas code\n", "\n", "To Step Up Your Pandas Game, read: \n", "- [5 lesser-known pandas tricks](https://towardsdatascience.com/5-lesser-known-pandas-tricks-e8ab1dd21431)\n", "- [Exploratory Data Analysis with pandas](https://towardsdatascience.com/exploratory-data-analysis-with-pandas-508a5e8a5964)" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "from platform import python_version\n", "\n", "import numpy as np\n", "import pandas as pd" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "np.random.seed(42) # set the seed to make examples repeatable" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "python version==3.7.3\n", "pandas==0.25.3\n", "numpy==1.17.4\n" ] } ], "source": [ "print(\"python version==%s\" % python_version())\n", "print(\"pandas==%s\" % pd.__version__)\n", "print(\"numpy==%s\" % np.__version__)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idcitybooked_perc
00-berlinberlin0.393636
11-new yorknew york0.473436
22-parisparis0.854547
33-berlinberlin0.340004
44-berlinberlin0.869650
\n", "
" ], "text/plain": [ " id city booked_perc\n", "0 0-berlin berlin 0.393636\n", "1 1-new york new york 0.473436\n", "2 2-paris paris 0.854547\n", "3 3-berlin berlin 0.340004\n", "4 4-berlin berlin 0.869650" ] }, "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], "source": [ "size = 10000\n", "\n", "cities = [\"paris\", \"barcelona\", \"berlin\", \"new york\"]\n", "\n", "df = pd.DataFrame(\n", " {\"city\": np.random.choice(cities, size=size), \"booked_perc\": np.random.rand(size)}\n", ")\n", "df[\"id\"] = df.index.map(str) + \"-\" + df.city\n", "df = df[[\"id\", \"city\", \"booked_perc\"]]\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "766 ms ± 20.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], "source": [ "%%timeit\n", "\n", "suma = 0\n", "for _, row in df.iterrows():\n", " suma += row.booked_perc" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "989 µs ± 18.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" ] } ], "source": [ "%%timeit\n", "\n", "sum(booked_perc for booked_perc in df.booked_perc)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "92 µs ± 2.21 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n" ] } ], "source": [ "%%timeit\n", "\n", "df.booked_perc.sum()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "831 ms ± 25.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], "source": [ "%%timeit\n", "\n", "suma = 0\n", "for _, row in df.iterrows():\n", " if row.booked_perc <= 0.5:\n", " suma += row.booked_perc" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "724 µs ± 18.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" ] } ], "source": [ "%%timeit\n", "df[df.booked_perc <= 0.5].booked_perc.sum()" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "1.55 ms ± 10.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" ] } ], "source": [ "%%timeit\n", "\n", "df[(df.booked_perc <= 0.5) & (df.city == 'new york')].booked_perc.sum()" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "7.02 s ± 24.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], "source": [ "%%timeit\n", "\n", "for i in range(1, len(df)):\n", " df.loc[i, \"perc_change\"] = (df.loc[i].booked_perc - df.loc[i - 1].booked_perc) / df.loc[i - 1].booked_perc" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "586 µs ± 17.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n" ] } ], "source": [ "%%timeit\n", "\n", "df[\"perc_change\"] = df.booked_perc.pct_change()" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3.58 s ± 48.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], "source": [ "%%timeit\n", "\n", "for i, row in df.iterrows():\n", " if row.city == 'new york':\n", " df.loc[i, 'sales_factor'] = row.booked_perc * 2\n", " else:\n", " df.loc[i, 'sales_factor'] = 0" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "165 ms ± 2.48 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" ] } ], "source": [ "%%timeit\n", "\n", "def calculate_sales_factor(row):\n", " if row.city == 'new york':\n", " return row.booked_perc * 2\n", " return 0\n", "\n", "df['sales_factor'] = df.apply(calculate_sales_factor, axis=1)" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3.03 ms ± 85.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], "source": [ "%%timeit \n", "\n", "df.loc[df.city == 'new york', 'sales_factor'] = df[df.city == 'new york'].booked_perc * 2\n", "df.sales_factor.fillna(0, inplace=True)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "878 ms ± 21.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n" ] } ], "source": [ "%%timeit \n", "\n", "avg_by_city = {}\n", "count_by_city = {}\n", "first_booking_by_city = {}\n", "\n", "for i, row in df.iterrows():\n", " city = row.city\n", " if city in avg_by_city:\n", " avg_by_city[city] += row.sales_factor\n", " count_by_city[city] += 1\n", " else:\n", " avg_by_city[city] = row.sales_factor\n", " count_by_city[city] = 1\n", " first_booking_by_city[city] = row['id']\n", "\n", "for city, _ in avg_by_city.items():\n", " avg_by_city[city] /= count_by_city[city]" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "3.05 ms ± 65.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], "source": [ "%%timeit\n", "\n", "df.groupby('city').sales_factor.mean()\n", "df.groupby('city').sales_factor.count()\n", "df.groupby('city').id.first()" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "4.5 ms ± 131 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n" ] } ], "source": [ "%%timeit\n", "\n", "df.groupby(\"city\").agg({\"sales_factor\": [\"mean\", \"count\"], \"id\": \"first\"})" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Surprisingly, the third example is not the fastest, but it is more concise than the second example.\n", "I would suggest you use the second approach when you need to speed up your code." ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }