{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# These pandas tips will save you hours of head scratching"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"To Step Up Your Pandas Game, read:\n",
"- [5 lesser-known pandas tricks](https://towardsdatascience.com/5-lesser-known-pandas-tricks-e8ab1dd21431)\n",
"- [Exploratory Data Analysis with pandas](https://towardsdatascience.com/exploratory-data-analysis-with-pandas-508a5e8a5964)\n",
"- [How NOT to write pandas code](https://towardsdatascience.com/how-not-to-write-pandas-code-ef88599c6e8f)\n",
"- [These pandas tips will save you hours of headscratching](https://)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"System\n",
"os name: posix\n",
"system: Darwin\n",
"release: 18.7.0\n",
"\n",
"Python\n",
"version: 3.7.3\n",
"\n",
"Python Packages\n",
"jupyterlab==1.1.5\n",
"pandas==0.25.3\n",
"numpy==1.17.4\n"
]
}
],
"source": [
"import os\n",
"import platform\n",
"from platform import python_version\n",
"\n",
"import jupyterlab\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"# Print the operating system, Python interpreter and package versions this\n",
"# notebook was executed with, so the recorded outputs can be reproduced.\n",
"print(\"System\")\n",
"print(\"os name: %s\" % os.name)\n",
"print(\"system: %s\" % platform.system())\n",
"print(\"release: %s\" % platform.release())\n",
"print()\n",
"print(\"Python\")\n",
"print(\"version: %s\" % python_version())\n",
"print()\n",
"# Labels use the pip requirement format (name==version).\n",
"print(\"Python Packages\")\n",
"print(\"jupyterlab==%s\" % jupyterlab.__version__)\n",
"print(\"pandas==%s\" % pd.__version__)\n",
"print(\"numpy==%s\" % np.__version__)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"np.random.seed(42)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.37454012, 0.95071431, 0.73199394, 0.59865848, 0.15601864,\n",
" 0.15599452, 0.05808361, 0.86617615, 0.60111501, 0.70807258])"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.random.random_sample(10)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.02058449, 0.96990985, 0.83244264, 0.21233911, 0.18182497,\n",
" 0.18340451, 0.30424224, 0.52475643, 0.43194502, 0.29122914])"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.random.random_sample(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"What happens when we set the same seed again?\n",
"We reset the seed and we get the same sequence of numbers as above.\n",
"This makes deterministic pseudorandom number generator."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"np.random.seed(42)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([0.37454012, 0.95071431, 0.73199394, 0.59865848, 0.15601864,\n",
" 0.15599452, 0.05808361, 0.86617615, 0.60111501, 0.70807258])"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"np.random.random_sample(10)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" col1 | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" a | \n",
"
\n",
" \n",
" 2 | \n",
" b | \n",
"
\n",
" \n",
" 3 | \n",
" c | \n",
"
\n",
" \n",
" 4 | \n",
" d | \n",
"
\n",
" \n",
" 4 | \n",
" d | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" col1\n",
"1 a\n",
"2 b\n",
"3 c\n",
"4 d\n",
"4 d"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.DataFrame(index=[1, 2, 3, 4, 4], data={\"col1\": [\"a\", \"b\", \"c\", \"d\", \"d\"]})\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"ename": "AssertionError",
"evalue": "Dataframe has duplicates",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mAssertionError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m\u001b[0m in \u001b[0;36m\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32massert\u001b[0m \u001b[0mlen\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0mdf\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mindex\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mduplicated\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m \u001b[0;34m==\u001b[0m \u001b[0;36m0\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0;34m\"Dataframe has duplicates\"\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mAssertionError\u001b[0m: Dataframe has duplicates"
]
}
],
"source": [
"assert len(df[df.index.duplicated()]) == 0, \"Dataframe has duplicates\""
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(5, 1)"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7, 2)"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_new = df.join(df, lsuffix='_l', rsuffix='_r')\n",
"df_new.shape"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" col1_l | \n",
" col1_r | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" a | \n",
" a | \n",
"
\n",
" \n",
" 2 | \n",
" b | \n",
" b | \n",
"
\n",
" \n",
" 3 | \n",
" c | \n",
" c | \n",
"
\n",
" \n",
" 4 | \n",
" d | \n",
" d | \n",
"
\n",
" \n",
" 4 | \n",
" d | \n",
" d | \n",
"
\n",
" \n",
" 4 | \n",
" d | \n",
" d | \n",
"
\n",
" \n",
" 4 | \n",
" d | \n",
" d | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" col1_l col1_r\n",
"1 a a\n",
"2 b b\n",
"3 c c\n",
"4 d d\n",
"4 d d\n",
"4 d d\n",
"4 d d"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_new"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" col1 | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" a | \n",
"
\n",
" \n",
" 2 | \n",
" b | \n",
"
\n",
" \n",
" 3 | \n",
" c | \n",
"
\n",
" \n",
" 4 | \n",
" d | \n",
"
\n",
" \n",
" 4 | \n",
" d | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" col1\n",
"1 a\n",
"2 b\n",
"3 c\n",
"4 d\n",
"4 d"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" col1 | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" a | \n",
"
\n",
" \n",
" 2 | \n",
" b | \n",
"
\n",
" \n",
" 3 | \n",
" c | \n",
"
\n",
" \n",
" 4 | \n",
" d | \n",
"
\n",
" \n",
" 4 | \n",
" d | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" col1\n",
"1 a\n",
"2 b\n",
"3 c\n",
"4 d\n",
"4 d"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_clipboard(sep='\\s\\s+')\n",
"df"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}