{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# How NOT to write pandas code\n",
    "\n",
    "To Step Up Your Pandas Game, read: \n",
    "- [5 lesser-known pandas tricks](https://towardsdatascience.com/5-lesser-known-pandas-tricks-e8ab1dd21431)\n",
    "- [Exploratory Data Analysis with pandas](https://towardsdatascience.com/exploratory-data-analysis-with-pandas-508a5e8a5964)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from platform import python_version\n",
    "\n",
    "import numpy as np\n",
    "import pandas as pd"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "np.random.seed(42) # set the seed to make examples repeatable"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "python version==3.7.3\n",
      "pandas==0.25.3\n",
      "numpy==1.17.4\n"
     ]
    }
   ],
   "source": [
    "print(\"python version==%s\" % python_version())\n",
    "print(\"pandas==%s\" % pd.__version__)\n",
    "print(\"numpy==%s\" % np.__version__)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>id</th>\n",
       "      <th>city</th>\n",
       "      <th>booked_perc</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>0-berlin</td>\n",
       "      <td>berlin</td>\n",
       "      <td>0.393636</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1-new york</td>\n",
       "      <td>new york</td>\n",
       "      <td>0.473436</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>2-paris</td>\n",
       "      <td>paris</td>\n",
       "      <td>0.854547</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>3-berlin</td>\n",
       "      <td>berlin</td>\n",
       "      <td>0.340004</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>4-berlin</td>\n",
       "      <td>berlin</td>\n",
       "      <td>0.869650</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "           id      city  booked_perc\n",
       "0    0-berlin    berlin     0.393636\n",
       "1  1-new york  new york     0.473436\n",
       "2     2-paris     paris     0.854547\n",
       "3    3-berlin    berlin     0.340004\n",
       "4    4-berlin    berlin     0.869650"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "size = 10000\n",
    "\n",
    "cities = [\"paris\", \"barcelona\", \"berlin\", \"new york\"]\n",
    "\n",
    "df = pd.DataFrame(\n",
    "    {\"city\": np.random.choice(cities, size=size), \"booked_perc\": np.random.rand(size)}\n",
    ")\n",
    "df[\"id\"] = df.index.map(str) + \"-\" + df.city\n",
    "df = df[[\"id\", \"city\", \"booked_perc\"]]\n",
    "df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "766 ms ± 20.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "\n",
    "suma = 0\n",
    "for _, row in df.iterrows():\n",
    "    suma += row.booked_perc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "989 µs ± 18.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "\n",
    "sum(booked_perc for booked_perc in df.booked_perc)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "92 µs ± 2.21 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "\n",
    "df.booked_perc.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "831 ms ± 25.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "\n",
    "suma = 0\n",
    "for _, row in df.iterrows():\n",
    "    if row.booked_perc <= 0.5:\n",
    "        suma += row.booked_perc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "724 µs ± 18.8 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "df[df.booked_perc <= 0.5].booked_perc.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1.55 ms ± 10.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "\n",
    "df[(df.booked_perc <= 0.5) & (df.city == 'new york')].booked_perc.sum()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "7.02 s ± 24.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "\n",
    "for i in range(1, len(df)):\n",
    "    df.loc[i, \"perc_change\"] =  (df.loc[i].booked_perc - df.loc[i - 1].booked_perc) / df.loc[i - 1].booked_perc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "586 µs ± 17.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "\n",
    "df[\"perc_change\"] = df.booked_perc.pct_change()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3.58 s ± 48.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "\n",
    "for i, row in df.iterrows():\n",
    "    if row.city == 'new york':\n",
    "        df.loc[i, 'sales_factor'] = row.booked_perc * 2\n",
    "    else:\n",
    "        df.loc[i, 'sales_factor'] = 0"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "165 ms ± 2.48 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "\n",
    "def calculate_sales_factor(row):\n",
    "    if row.city == 'new york':\n",
    "        return row.booked_perc * 2\n",
    "    return 0\n",
    "\n",
    "df['sales_factor'] = df.apply(calculate_sales_factor, axis=1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3.03 ms ± 85.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit \n",
    "\n",
    "df.loc[df.city == 'new york', 'sales_factor'] = df[df.city == 'new york'].booked_perc * 2\n",
    "df.sales_factor.fillna(0, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "878 ms ± 21.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit \n",
    "\n",
    "avg_by_city = {}\n",
    "count_by_city = {}\n",
    "first_booking_by_city = {}\n",
    "\n",
    "for i, row in df.iterrows():\n",
    "    city = row.city\n",
    "    if city in avg_by_city:\n",
    "        avg_by_city[city] += row.sales_factor\n",
    "        count_by_city[city] += 1\n",
    "    else:\n",
    "        avg_by_city[city] = row.sales_factor\n",
    "        count_by_city[city] = 1\n",
    "        first_booking_by_city[city] = row['id']\n",
    "\n",
    "for city, _ in avg_by_city.items():\n",
    "    avg_by_city[city] /= count_by_city[city]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "3.05 ms ± 65.3 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "\n",
    "df.groupby('city').sales_factor.mean()\n",
    "df.groupby('city').sales_factor.count()\n",
    "df.groupby('city').id.first()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "4.5 ms ± 131 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)\n"
     ]
    }
   ],
   "source": [
    "%%timeit\n",
    "\n",
    "df.groupby(\"city\").agg({\"sales_factor\": [\"mean\", \"count\"], \"id\": \"first\"})"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Surprisingly, the third example is not the fastest, but it is more concise than the second example.\n",
    "I would suggest you use the second approach when you need to speed up your code."
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}