{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# A Few Gotchas with pandas\n",
"\n",
"Doing data analysis with pandas is pretty straightforward until it is NOT! In this blog post, I present a few pandas gotchas so that you can avoid them.\n",
"\n",
"To run the examples, download this Jupyter notebook.\n",
"\n",
"To Step Up Your Pandas Game, read:\n",
"- [5 lesser-known pandas tricks](https://towardsdatascience.com/5-lesser-known-pandas-tricks-e8ab1dd21431),\n",
"- [Exploratory Data Analysis with pandas](https://towardsdatascience.com/exploratory-data-analysis-with-pandas-508a5e8a5964),\n",
"- [How NOT to write pandas code](https://towardsdatascience.com/how-not-to-write-pandas-code-ef88599c6e8f)."
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"%matplotlib inline"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"import glob\n",
"from platform import python_version\n",
"\n",
"import pandas as pd\n",
"import matplotlib as mpl"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [],
"source": [
"mpl.pyplot.rcParams['figure.facecolor'] = 'w'"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"python version==3.7.3\n",
"pandas==0.25.3\n",
"matplotlib==3.1.2\n"
]
}
],
"source": [
"print(\"python version==%s\" % python_version())\n",
"print(\"pandas==%s\" % pd.__version__)\n",
"print(\"matplotlib==%s\" % mpl.__version__)"
]
},
{
"cell_type": "code",
"execution_count": 26,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['data/20191210.csv.gz', 'data/20191209.csv.gz', 'data/20191211.csv.gz']"
]
},
"execution_count": 26,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filepaths = glob.glob('data/*.gz')\n",
"filepaths"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['data/20191209.csv.gz', 'data/20191210.csv.gz', 'data/20191211.csv.gz']"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"filepaths = sorted(filepaths)\n",
"filepaths"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"['100', '1000', '200', '20000', '9']"
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted(['100', '200', '1000', '20000', '9'])"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[9, 100, 200, 1000, 20000]"
]
},
"execution_count": 29,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"sorted([100, 200, 1000, 20000, 9])"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1578327, 10)"
]
},
"execution_count": 30,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df_list = []\n",
"for filepath in filepaths:\n",
" df_ = pd.read_csv(filepath)\n",
" df_list.append(df_)\n",
"\n",
"df = pd.concat(df_list)\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" timestamp | \n",
" symbol | \n",
" side | \n",
" size | \n",
" price | \n",
" tickDirection | \n",
" trdMatchID | \n",
" grossValue | \n",
" homeNotional | \n",
" foreignNotional | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2019-12-09D00:00:18.265072000 | \n",
" ADAZ19 | \n",
" Buy | \n",
" 5319 | \n",
" 0.000005 | \n",
" ZeroPlusTick | \n",
" 7317fb7d-1bec-af18-e030-d8d6d9c684ac | \n",
" 2749923 | \n",
" 5319.0 | \n",
" 0.027499 | \n",
"
\n",
" \n",
" 1 | \n",
" 2019-12-09D00:00:42.503391000 | \n",
" ADAZ19 | \n",
" Sell | \n",
" 10000 | \n",
" 0.000005 | \n",
" MinusTick | \n",
" 43a3c08b-2c86-20a4-9722-f59f0a3e12c8 | \n",
" 5160000 | \n",
" 10000.0 | \n",
" 0.051600 | \n",
"
\n",
" \n",
" 2 | \n",
" 2019-12-09D00:01:50.513620000 | \n",
" ADAZ19 | \n",
" Buy | \n",
" 20000 | \n",
" 0.000005 | \n",
" PlusTick | \n",
" 643f6fea-a72e-f3ac-994f-e923a0c2212c | \n",
" 10340000 | \n",
" 20000.0 | \n",
" 0.103400 | \n",
"
\n",
" \n",
" 3 | \n",
" 2019-12-09D00:02:35.189775000 | \n",
" ADAZ19 | \n",
" Buy | \n",
" 18543 | \n",
" 0.000005 | \n",
" ZeroPlusTick | \n",
" 153fe5ef-b974-606e-2321-b3ccff0caee3 | \n",
" 9586731 | \n",
" 18543.0 | \n",
" 0.095867 | \n",
"
\n",
" \n",
" 4 | \n",
" 2019-12-09D00:03:49.934950000 | \n",
" ADAZ19 | \n",
" Buy | \n",
" 1138 | \n",
" 0.000005 | \n",
" ZeroPlusTick | \n",
" dfc5efdd-76fa-9ef7-38bf-ffec4a564530 | \n",
" 588346 | \n",
" 1138.0 | \n",
" 0.005883 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" timestamp symbol side size price tickDirection \\\n",
"0 2019-12-09D00:00:18.265072000 ADAZ19 Buy 5319 0.000005 ZeroPlusTick \n",
"1 2019-12-09D00:00:42.503391000 ADAZ19 Sell 10000 0.000005 MinusTick \n",
"2 2019-12-09D00:01:50.513620000 ADAZ19 Buy 20000 0.000005 PlusTick \n",
"3 2019-12-09D00:02:35.189775000 ADAZ19 Buy 18543 0.000005 ZeroPlusTick \n",
"4 2019-12-09D00:03:49.934950000 ADAZ19 Buy 1138 0.000005 ZeroPlusTick \n",
"\n",
" trdMatchID grossValue homeNotional \\\n",
"0 7317fb7d-1bec-af18-e030-d8d6d9c684ac 2749923 5319.0 \n",
"1 43a3c08b-2c86-20a4-9722-f59f0a3e12c8 5160000 10000.0 \n",
"2 643f6fea-a72e-f3ac-994f-e923a0c2212c 10340000 20000.0 \n",
"3 153fe5ef-b974-606e-2321-b3ccff0caee3 9586731 18543.0 \n",
"4 dfc5efdd-76fa-9ef7-38bf-ffec4a564530 588346 1138.0 \n",
"\n",
" foreignNotional \n",
"0 0.027499 \n",
"1 0.051600 \n",
"2 0.103400 \n",
"3 0.095867 \n",
"4 0.005883 "
]
},
"execution_count": 31,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 2019-12-09D00:00:18.265072000\n",
"1 2019-12-09D00:00:42.503391000\n",
"2 2019-12-09D00:01:50.513620000\n",
"3 2019-12-09D00:02:35.189775000\n",
"4 2019-12-09D00:03:49.934950000\n",
" ... \n",
"389905 2019-12-11D23:35:26.637330000\n",
"389906 2019-12-11D23:35:26.637330000\n",
"389907 2019-12-11D23:37:25.166632000\n",
"389908 2019-12-11D23:40:09.701271000\n",
"389909 2019-12-11D23:55:46.793512000\n",
"Name: timestamp, Length: 1578327, dtype: object"
]
},
"execution_count": 32,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.timestamp"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 100\n",
"2 1000\n",
"1 200\n",
"3 20000\n",
"4 9\n",
"dtype: object"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"pd.Series(['100', '200', '1000', '20000', '9']).sort_values()"
]
},
{
"cell_type": "code",
"execution_count": 34,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 2019-12-09 00:00:18.265072\n",
"1 2019-12-09 00:00:42.503391\n",
"2 2019-12-09 00:01:50.513620\n",
"3 2019-12-09 00:02:35.189775\n",
"4 2019-12-09 00:03:49.934950\n",
" ... \n",
"389905 2019-12-11 23:35:26.637330\n",
"389906 2019-12-11 23:35:26.637330\n",
"389907 2019-12-11 23:37:25.166632\n",
"389908 2019-12-11 23:40:09.701271\n",
"389909 2019-12-11 23:55:46.793512\n",
"Name: timestamp, Length: 1578327, dtype: datetime64[ns]"
]
},
"execution_count": 34,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.timestamp = pd.to_datetime(df.timestamp.str.replace(\"D\", \"T\"))\n",
"df.timestamp"
]
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(1309671, 10)"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = df[df.symbol == 'XBTUSD']\n",
"df.shape"
]
},
{
"cell_type": "code",
"execution_count": 40,
"metadata": {},
"outputs": [],
"source": [
"df = df.sort_values(\"timestamp\")"
]
},
{
"cell_type": "code",
"execution_count": 41,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" timestamp | \n",
" symbol | \n",
" side | \n",
" size | \n",
" price | \n",
" tickDirection | \n",
" trdMatchID | \n",
" grossValue | \n",
" homeNotional | \n",
" foreignNotional | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 2019-12-09 00:00:03.137637 | \n",
" XBTUSD | \n",
" Buy | \n",
" 500 | \n",
" 7512.5 | \n",
" ZeroPlusTick | \n",
" 084e9218-9596-3457-fee1-3012bb3ce1a6 | \n",
" 6655500 | \n",
" 0.066555 | \n",
" 500.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 2019-12-09 00:00:03.157349 | \n",
" XBTUSD | \n",
" Buy | \n",
" 14 | \n",
" 7512.5 | \n",
" ZeroPlusTick | \n",
" d25b3551-5850-0974-b844-80d591dff4d2 | \n",
" 186354 | \n",
" 0.001864 | \n",
" 14.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 2019-12-09 00:00:03.165542 | \n",
" XBTUSD | \n",
" Sell | \n",
" 5 | \n",
" 7512.0 | \n",
" MinusTick | \n",
" bad21a77-12eb-aed6-809b-8d3090d8efb2 | \n",
" 66560 | \n",
" 0.000666 | \n",
" 5.0 | \n",
"
\n",
" \n",
" 3 | \n",
" 2019-12-09 00:00:03.174043 | \n",
" XBTUSD | \n",
" Sell | \n",
" 383 | \n",
" 7512.0 | \n",
" ZeroMinusTick | \n",
" 57da5cfd-a646-d014-d8b3-3c18e7565abc | \n",
" 5098496 | \n",
" 0.050985 | \n",
" 383.0 | \n",
"
\n",
" \n",
" 4 | \n",
" 2019-12-09 00:00:03.178405 | \n",
" XBTUSD | \n",
" Sell | \n",
" 20 | \n",
" 7512.0 | \n",
" ZeroMinusTick | \n",
" d95cdd09-7784-21e4-b008-192762fbf199 | \n",
" 266240 | \n",
" 0.002662 | \n",
" 20.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" timestamp symbol side size price tickDirection \\\n",
"0 2019-12-09 00:00:03.137637 XBTUSD Buy 500 7512.5 ZeroPlusTick \n",
"1 2019-12-09 00:00:03.157349 XBTUSD Buy 14 7512.5 ZeroPlusTick \n",
"2 2019-12-09 00:00:03.165542 XBTUSD Sell 5 7512.0 MinusTick \n",
"3 2019-12-09 00:00:03.174043 XBTUSD Sell 383 7512.0 ZeroMinusTick \n",
"4 2019-12-09 00:00:03.178405 XBTUSD Sell 20 7512.0 ZeroMinusTick \n",
"\n",
" trdMatchID grossValue homeNotional \\\n",
"0 084e9218-9596-3457-fee1-3012bb3ce1a6 6655500 0.066555 \n",
"1 d25b3551-5850-0974-b844-80d591dff4d2 186354 0.001864 \n",
"2 bad21a77-12eb-aed6-809b-8d3090d8efb2 66560 0.000666 \n",
"3 57da5cfd-a646-d014-d8b3-3c18e7565abc 5098496 0.050985 \n",
"4 d95cdd09-7784-21e4-b008-192762fbf199 266240 0.002662 \n",
"\n",
" foreignNotional \n",
"0 500.0 \n",
"1 14.0 \n",
"2 5.0 \n",
"3 383.0 \n",
"4 20.0 "
]
},
"execution_count": 41,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.reset_index(inplace=True, drop=True)\n",
"df.head()"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" size | \n",
" price | \n",
" grossValue | \n",
" homeNotional | \n",
" foreignNotional | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 1.309671e+06 | \n",
" 1.309671e+06 | \n",
" 1.309671e+06 | \n",
" 1.309671e+06 | \n",
" 1.309671e+06 | \n",
"
\n",
" \n",
" mean | \n",
" 3.857544e+03 | \n",
" 7.336222e+03 | \n",
" 5.254340e+07 | \n",
" 5.254340e-01 | \n",
" 3.857544e+03 | \n",
"
\n",
" \n",
" std | \n",
" 1.679764e+04 | \n",
" 1.224793e+02 | \n",
" 2.284365e+08 | \n",
" 2.284365e+00 | \n",
" 1.679764e+04 | \n",
"
\n",
" \n",
" min | \n",
" 1.000000e+00 | \n",
" 7.122000e+03 | \n",
" 1.300600e+04 | \n",
" 1.300600e-04 | \n",
" 1.000000e+00 | \n",
"
\n",
" \n",
" 25% | \n",
" 3.200000e+01 | \n",
" 7.223000e+03 | \n",
" 4.367360e+05 | \n",
" 4.367360e-03 | \n",
" 3.200000e+01 | \n",
"
\n",
" \n",
" 50% | \n",
" 2.000000e+02 | \n",
" 7.326500e+03 | \n",
" 2.732200e+06 | \n",
" 2.732200e-02 | \n",
" 2.000000e+02 | \n",
"
\n",
" \n",
" 75% | \n",
" 1.627000e+03 | \n",
" 7.460500e+03 | \n",
" 2.222291e+07 | \n",
" 2.222291e-01 | \n",
" 1.627000e+03 | \n",
"
\n",
" \n",
" max | \n",
" 2.486453e+06 | \n",
" 7.689000e+03 | \n",
" 3.328615e+10 | \n",
" 3.328615e+02 | \n",
" 2.486453e+06 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" size price grossValue homeNotional foreignNotional\n",
"count 1.309671e+06 1.309671e+06 1.309671e+06 1.309671e+06 1.309671e+06\n",
"mean 3.857544e+03 7.336222e+03 5.254340e+07 5.254340e-01 3.857544e+03\n",
"std 1.679764e+04 1.224793e+02 2.284365e+08 2.284365e+00 1.679764e+04\n",
"min 1.000000e+00 7.122000e+03 1.300600e+04 1.300600e-04 1.000000e+00\n",
"25% 3.200000e+01 7.223000e+03 4.367360e+05 4.367360e-03 3.200000e+01\n",
"50% 2.000000e+02 7.326500e+03 2.732200e+06 2.732200e-02 2.000000e+02\n",
"75% 1.627000e+03 7.460500e+03 2.222291e+07 2.222291e-01 1.627000e+03\n",
"max 2.486453e+06 7.689000e+03 3.328615e+10 3.328615e+02 2.486453e+06"
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['timestamp', 'symbol', 'side', 'size', 'price', 'tickDirection',\n",
" 'trdMatchID', 'grossValue', 'homeNotional', 'foreignNotional'],\n",
" dtype='object')"
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 42,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" timestamp | \n",
" symbol | \n",
" side | \n",
" size | \n",
" price | \n",
" tickDirection | \n",
" trdMatchID | \n",
" grossValue | \n",
" homeNotional | \n",
" foreignNotional | \n",
"
\n",
" \n",
" \n",
" \n",
" count | \n",
" 1309671 | \n",
" 1309671 | \n",
" 1309671 | \n",
" 1.309671e+06 | \n",
" 1.309671e+06 | \n",
" 1309671 | \n",
" 1309671 | \n",
" 1.309671e+06 | \n",
" 1.309671e+06 | \n",
" 1.309671e+06 | \n",
"
\n",
" \n",
" unique | \n",
" 407403 | \n",
" 1 | \n",
" 2 | \n",
" NaN | \n",
" NaN | \n",
" 4 | \n",
" 1309671 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" top | \n",
" 2019-12-09 07:04:18.260564 | \n",
" XBTUSD | \n",
" Sell | \n",
" NaN | \n",
" NaN | \n",
" ZeroMinusTick | \n",
" 34370c9e-38b5-fdcd-a900-ec25969cffc1 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" freq | \n",
" 1421 | \n",
" 1309671 | \n",
" 679823 | \n",
" NaN | \n",
" NaN | \n",
" 603821 | \n",
" 1 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" first | \n",
" 2019-12-09 00:00:03.137637 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" last | \n",
" 2019-12-11 23:59:59.861272 | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
"
\n",
" \n",
" mean | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 3.857544e+03 | \n",
" 7.336222e+03 | \n",
" NaN | \n",
" NaN | \n",
" 5.254340e+07 | \n",
" 5.254340e-01 | \n",
" 3.857544e+03 | \n",
"
\n",
" \n",
" std | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 1.679764e+04 | \n",
" 1.224793e+02 | \n",
" NaN | \n",
" NaN | \n",
" 2.284365e+08 | \n",
" 2.284365e+00 | \n",
" 1.679764e+04 | \n",
"
\n",
" \n",
" min | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 1.000000e+00 | \n",
" 7.122000e+03 | \n",
" NaN | \n",
" NaN | \n",
" 1.300600e+04 | \n",
" 1.300600e-04 | \n",
" 1.000000e+00 | \n",
"
\n",
" \n",
" 25% | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 3.200000e+01 | \n",
" 7.223000e+03 | \n",
" NaN | \n",
" NaN | \n",
" 4.367360e+05 | \n",
" 4.367360e-03 | \n",
" 3.200000e+01 | \n",
"
\n",
" \n",
" 50% | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 2.000000e+02 | \n",
" 7.326500e+03 | \n",
" NaN | \n",
" NaN | \n",
" 2.732200e+06 | \n",
" 2.732200e-02 | \n",
" 2.000000e+02 | \n",
"
\n",
" \n",
" 75% | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 1.627000e+03 | \n",
" 7.460500e+03 | \n",
" NaN | \n",
" NaN | \n",
" 2.222291e+07 | \n",
" 2.222291e-01 | \n",
" 1.627000e+03 | \n",
"
\n",
" \n",
" max | \n",
" NaN | \n",
" NaN | \n",
" NaN | \n",
" 2.486453e+06 | \n",
" 7.689000e+03 | \n",
" NaN | \n",
" NaN | \n",
" 3.328615e+10 | \n",
" 3.328615e+02 | \n",
" 2.486453e+06 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" timestamp symbol side size \\\n",
"count 1309671 1309671 1309671 1.309671e+06 \n",
"unique 407403 1 2 NaN \n",
"top 2019-12-09 07:04:18.260564 XBTUSD Sell NaN \n",
"freq 1421 1309671 679823 NaN \n",
"first 2019-12-09 00:00:03.137637 NaN NaN NaN \n",
"last 2019-12-11 23:59:59.861272 NaN NaN NaN \n",
"mean NaN NaN NaN 3.857544e+03 \n",
"std NaN NaN NaN 1.679764e+04 \n",
"min NaN NaN NaN 1.000000e+00 \n",
"25% NaN NaN NaN 3.200000e+01 \n",
"50% NaN NaN NaN 2.000000e+02 \n",
"75% NaN NaN NaN 1.627000e+03 \n",
"max NaN NaN NaN 2.486453e+06 \n",
"\n",
" price tickDirection trdMatchID \\\n",
"count 1.309671e+06 1309671 1309671 \n",
"unique NaN 4 1309671 \n",
"top NaN ZeroMinusTick 34370c9e-38b5-fdcd-a900-ec25969cffc1 \n",
"freq NaN 603821 1 \n",
"first NaN NaN NaN \n",
"last NaN NaN NaN \n",
"mean 7.336222e+03 NaN NaN \n",
"std 1.224793e+02 NaN NaN \n",
"min 7.122000e+03 NaN NaN \n",
"25% 7.223000e+03 NaN NaN \n",
"50% 7.326500e+03 NaN NaN \n",
"75% 7.460500e+03 NaN NaN \n",
"max 7.689000e+03 NaN NaN \n",
"\n",
" grossValue homeNotional foreignNotional \n",
"count 1.309671e+06 1.309671e+06 1.309671e+06 \n",
"unique NaN NaN NaN \n",
"top NaN NaN NaN \n",
"freq NaN NaN NaN \n",
"first NaN NaN NaN \n",
"last NaN NaN NaN \n",
"mean 5.254340e+07 5.254340e-01 3.857544e+03 \n",
"std 2.284365e+08 2.284365e+00 1.679764e+04 \n",
"min 1.300600e+04 1.300600e-04 1.000000e+00 \n",
"25% 4.367360e+05 4.367360e-03 3.200000e+01 \n",
"50% 2.732200e+06 2.732200e-02 2.000000e+02 \n",
"75% 2.222291e+07 2.222291e-01 1.627000e+03 \n",
"max 3.328615e+10 3.328615e+02 2.486453e+06 "
]
},
"execution_count": 42,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.describe(include='all')"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 28,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAZAAAAD4CAYAAADCb7BPAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAbfElEQVR4nO3de3BU9f3/8deGcJE74Ro3TGGzgcZcYGQj/EFtA6RBW0KVKHiZxAkaigzYUaQ6aavMwJDBf1TQajRplz8ILcxIOkMMTIWpYsWwQToN8EcmgpPbYEjCxSiBJO/vH/7c30QgCQfhuOX5+Cfkved8Pu+dMefl55yzZz1mZgIA4DpFud0AACAyESAAAEcIEACAIwQIAMARAgQA4Ei02w3cKuPGjdOUKVPcbgMAIsqpU6d05syZq7522wTIlClTFAqF3G4DACJKIBC45mucwgIAOEKAAAAcIUAAAI4QIAAAR/oMkLq6OqWnp+uuu+5SUlKSXnvtNUnSyy+/LK/Xq5kzZ2rmzJkqLy8P77Np0yb5/X5Nnz5de/fuDdcrKio0ffp0+f1+FRYWhusnT57U7Nmz5ff7tXTpUl26dEmS1NHRoaVLl8rv92v27Nk6depUn3MAAG4R60NjY6NVVVWZmdn58+ctISHBjh07Zi+99JK98sorV2x/7NgxS01NtYsXL9rnn39uPp/POjs7rbOz03w+n9XW1lpHR4elpqbasWPHzMzsoYcestLSUjMzW7Fihb355ptmZvbGG2/YihUrzMystLTUHn744V7n6M2sWbP6eqvALbd9+3ZLSkqyqKgoS0pKsu3bt7vdEtBDb8fOPlcgsbGxuvvuuyVJI0aMUGJiohoaGq65fVlZmZYtW6bBgwdr6tSp8vv9qqysVGVlpfx+v3w+nwYNGqRly5aprKxMZqb9+/crOztbkpSbm6vdu3eHx8rNzZUkZWdn64MPPpCZXXMOIJKUlpaqoKBAW7Zs0cWLF7VlyxYVFBSotLTU7daAfrmuayCnTp3SZ599ptmzZ0uStm7dqtTUVOXl5amtrU2S1NDQoMmTJ4f3iYuLU0NDwzXrLS0tGj16tKKjo3vUvz9WdHS0Ro0apZaWlmuO9X1FRUUKBAIKBAJqbm6+nrcK3HQbN25UcXGx0tPTNXDgQKWnp6u4uFgbN250uzWgX/odIF999ZWWLFmiV199VSNHjtTKlStVW1uro0ePKjY2Vs8999zN7NOR/Px8hUIhhUIhjR8/3u12gB5OnDihuXPn9qjNnTtXJ06ccKkj4Pr0K0AuX76sJUuW6LHHHtODDz4oSZo4caIGDBigqKgoPfXUU+FTSF6vV3V1deF96+vr5fV6r1kfO3aszp49q87Ozh7174/V2dmpc+fOaezYsdccC4gkiYmJOnjwYI/awYMHlZiY6FJHwPXpM0DMTMuXL1diYqKeffbZcL2pqSn87/fee0/JycmSpKysLO3YsUMdHR06efKkampqdM899ygtLU01NTU6efKkLl26pB07digrK0sej0fp6enatWuXJCkYDGrx4sXhsYLBoCRp165dmjdvnjwezzXnACJJQUGBli9frgMHDujy5cs6cOCAli9froKCArdbA/qnryvwH330kUmylJQUmzFjhs2YMcP27Nljjz/+uCUnJ1tKSootWrTIGhsbw/ts2LDBfD6fTZs2zcrLy8P1PXv2WEJCgvl8PtuwYUO4Xltba2lpaRYfH2/Z2dl28eJFMzP75ptvLDs72+Lj4y0tLc1qa2v7nMPJnQSAW7gLCz92vR07PWa3x3eiBwIBHqYIANept2Mnn0QHADhCgAAAHCFAAACOECAAAEcIEACAIwQIAMARAgQA4AgBAgBwhAABADhCgAAAHCFAAACOECAAAEcIEACAIwQIAMARAgQA4AgBAgBwhAABADhCgAAAHCFAAACOECAAAEcIEACAIwQIAMARAgQA4AgBAg
BwhAABADhCgAAAHCFAAACOECAAAEcIEACAIwQIAMARAgQA4EifAVJXV6f09HTdddddSkpK0muvvSZJam1tVUZGhhISEpSRkaG2tjZJkplpzZo18vv9Sk1N1ZEjR8JjBYNBJSQkKCEhQcFgMFyvqqpSSkqK/H6/1qxZIzNzPAcA4BaxPjQ2NlpVVZWZmZ0/f94SEhLs2LFj9vzzz9umTZvMzGzTpk22bt06MzPbs2ePLVy40Lq7u+2TTz6xe+65x8zMWlpabOrUqdbS0mKtra02depUa21tNTOztLQ0++STT6y7u9sWLlxo5eXlZmbXPUdvZs2a1ec2AICeejt29rkCiY2N1d133y1JGjFihBITE9XQ0KCysjLl5uZKknJzc7V7925JUllZmXJycuTxeDRnzhydPXtWTU1N2rt3rzIyMhQTE6MxY8YoIyNDFRUVampq0vnz5zVnzhx5PB7l5OT0GOt65gAA3DrXdQ3k1KlT+uyzzzR79mydPn1asbGxkqRJkybp9OnTkqSGhgZNnjw5vE9cXJwaGhp6rcfFxV1Rl3Tdc3xfUVGRAoGAAoGAmpubr+etAgD60O8A+eqrr7RkyRK9+uqrGjlyZI/XPB6PPB7PD97cjc6Rn5+vUCikUCik8ePH36TOAOD21K8AuXz5spYsWaLHHntMDz74oCRp4sSJ4dNGTU1NmjBhgiTJ6/Wqrq4uvG99fb28Xm+v9fr6+ivqTuYAANw6fQaImWn58uVKTEzUs88+G65nZWWF76QKBoNavHhxuL5t2zaZmQ4dOqRRo0YpNjZWmZmZ2rdvn9ra2tTW1qZ9+/YpMzNTsbGxGjlypA4dOiQz07Zt23qMdT1zAABuob6uwH/00UcmyVJSUmzGjBk2Y8YM27Nnj505c8bmzZtnfr/f5s+fby0tLWZm1t3dbU8//bT5fD5LTk62w4cPh8cqLi62+Ph4i4+Pt5KSknD98OHDlpSUZD6fz1atWmXd3d1mZo7mcHInAQDg6no7dnrM/t+HLv7HBQIBhUIht9sAgIjS27GTT6IDABwhQAAAjhAgAABHCBAAgCMECADAEQIEAOAIAQIAcIQAAQA4QoAAABwhQAAAjhAgAABHCBAAgCMECADAEQIEAOAIAQIAcIQAAQA4QoAAABwhQAAAjhAgAABHCBAAgCMECADAEQIEAOAIAQIAcIQAAQA4QoAAABwhQAAXlZaWKjk5WQMGDFBycrJKS0vdbgnoNwIEcElpaameeeYZtbe3S5La29v1zDPPECKIGAQI4JJ169YpOjpaJSUlunjxokpKShQdHa1169a53RrQLwQI4JL6+noFg0Glp6dr4MCBSk9PVzAYVH19vdutAf1CgAAAHCFAAJfExcUpJydHBw4c0OXLl3XgwAHl5OQoLi7O7daAfukzQPLy8jRhwgQlJyeHay+//LK8Xq9mzpypmTNnqry8PPzapk2b5Pf7NX36dO3duzdcr6io0PTp0+X3+1VYWBiunzx5UrNnz5bf79fSpUt16dIlSVJHR4eWLl0qv9+v2bNn69SpU33OAUSSzZs3q6urS3l5eRo8eLDy8vLU1dWlzZs3u90a0D/Wh3/9619WVVVlSUlJ4dpLL71kr7zyyhXbHjt2zFJTU+3ixYv2+eefm8/ns87OTuvs7DSfz2e1tbXW0dFhqampduzYMTMze+ihh6y0tNTMzFasWGFvvvmmmZm98cYbtmLFCjMzKy0ttYcffrjXOfoya9asPrcBbrXt27dbUlKSRUVFWVJSkm3fvt3tloAeejt29rkCuffeexUTE9OvMCorK9OyZcs0ePBgTZ06VX6/X5WVlaqsrJTf75fP59OgQYO0bNkylZWVycy0f/9+ZWdnS5Jyc3O1e/fu8Fi5ubmSpOzsbH3wwQcys2vOAUSiRx55RNXV1erq6lJ1dbUeeeQRt1sC+s3xNZCtW7cqNTVVeXl5amtrkyQ1NDRo8uTJ4W3i4uLU0NBwzXpLS4tGjx6t6O
joHvXvjxUdHa1Ro0appaXlmmNdTVFRkQKBgAKBgJqbm52+VQDAVTgKkJUrV6q2tlZHjx5VbGysnnvuuR+6rx9Efn6+QqGQQqGQxo8f73Y7APA/xVGATJw4UQMGDFBUVJSeeuqp8Ckkr9erurq68Hb19fXyer3XrI8dO1Znz55VZ2dnj/r3x+rs7NS5c+c0duzYa44FALi1HAVIU1NT+N/vvfde+A6trKws7dixQx0dHTp58qRqamp0zz33KC0tTTU1NTp58qQuXbqkHTt2KCsrSx6PR+np6dq1a5ckKRgMavHixeGxgsGgJGnXrl2aN2+ePB7PNecAIhHPwkJE6+sK/LJly2zSpEkWHR1tXq/X3n33XXv88cctOTnZUlJSbNGiRdbY2BjefsOGDebz+WzatGlWXl4eru/Zs8cSEhLM5/PZhg0bwvXa2lpLS0uz+Ph4y87OtosXL5qZ2TfffGPZ2dkWHx9vaWlpVltb2+ccTu8kANywfft2mzp1qu3fv98uXbpk+/fvt6lTp3InFn5Uejt2eszM3A6xWyEQCCgUCrndBhCWnJysLVu2KD09PVw7cOCAVq9ererqahc7A/6/3o6dfBIdcMmJEyc0d+7cHrW5c+fqxIkTLnUEXB8CBHBJYmKiDh482KN28OBBJSYmutQRcH0IEMAlBQUFWr58eY9nYS1fvlwFBQVutwb0S7TbDQC3q+8+db569WqdOHFCiYmJ2rhxI59GR8RgBQIAcIQVCOCS0tJSFRQUqLi4WHPnztXBgwe1fPlySWIVgojACgRwycaNG1VcXNzjGwmLi4u1ceNGt1sD+oUAAVxy4sQJ7dy5U0OGDJHH49GQIUO0c+dObuNFxCBAAJeMHj1ab731lkaPHn3V34EfOwIEcMm5c+fk8Xi0bt06tbe3a926dfJ4PDp37pzbrQH9QoAALunq6tLatWtVUlKiESNGqKSkRGvXrlVXV5fbrQH9QoAALho3blyPbyQcN26c2y0B/cZtvIBLYmJi9OKLL2rAgAH67W9/q7feeksvvvhiv79CGnAbKxDAJVu3btXQoUP1wgsvaNiwYXrhhRc0dOhQbd261e3WgH4hQACXPPLII3r77bc1bdo0RUVFadq0aXr77bf5ECEiBgECAHCEayCAS3iUCSIdKxDAJRs3btSjjz6q1atXa8iQIVq9erUeffRRHmWCiMEKBHDJ8ePH1d7erpKSkvAKJC8vT1988YXbrQH9QoAALhk0aJC8Xq/uu+8+dXR0aPDgwQoEAmpqanK7NaBfOIUFuKSjo0Mff/yx8vLydPbsWeXl5enjjz9WR0eH260B/UKAAC7xeDxasGCBPvzwQ8XExOjDDz/UggUL5PF43G4N6BcCBHCJmeno0aNqb2+XJLW3t+vo0aMyM5c7A/qHayCAS6Kjo3XhwgWdO3dO3d3damhoUFRUlKKj+bNEZGAFArhk8ODB6ujo0JNPPqmzZ8/qySefDF9MByIBAQK4pL29XVlZWSopKdHo0aNVUlKirKys8Ckt4MeOAAFclJSUJL/fr6ioKPn9fiUlJbndEtBvBAjgkpiYGG3evFl5eXm6cOGC8vLytHnzZh7njohBgAAuGTp0qIYPH64tW7ZoxIgR2rJli4YPH66hQ4e63RrQLwQI4JLGxkZt2bJFw4YNkyQNGzZMW7ZsUWNjo8udAf3TZ4Dk5eVpwoQJSk5ODtdaW1uVkZGhhIQEZWRkqK2tTdK397WvWbNGfr9fqampOnLkSHifYDCohIQEJSQkKBgMhutVVVVKSUmR3+/XmjVrwvfAO5kDiCSJiYl6/fXXdfz4cXV3d+v48eN6/fXXlZiY6HZrQL/0GSBPPPGEKioqetQKCws1f/581dTUaP78+SosLJQkvf/++6qpqVFNTY2Kioq0cuVKSd+Gwfr16/Xpp5+qsrJS69evDwfCypUr9c4774T3+26u650DiDRRUVEKhUJatGiRmpubtWjRIoVCIUVFcWIAkaHP/1
LvvffeKy7qlZWVKTc3V5KUm5ur3bt3h+s5OTnyeDyaM2eOzp49q6amJu3du1cZGRmKiYnRmDFjlJGRoYqKCjU1Nen8+fOaM2eOPB6PcnJyeox1PXMAkaa6uloLFixQbW2tJk6cqNraWi1YsEDV1dVutwb0i6P/1Tl9+rRiY2MlSZMmTdLp06clSQ0NDZo8eXJ4u7i4ODU0NPRaj4uLu6LuZI6rKSoqUiAQUCAQUHNzs5O3Ctw0ZqalS5f2qC1dupRHmSBi3PAzEzwez01/+JvTOfLz85Wfny9JCgQCP3RbwA179tlnVVZWFv4+kMWLF7vdEtBvjlYgEydODJ82ampq0oQJEyRJXq9XdXV14e3q6+vl9Xp7rdfX119RdzIHEGmGDRumCxcuaMmSJRo8eLCWLFmiCxcuhO/KAn7sHAVIVlZW+E6qYDAY/r+mrKwsbdu2TWamQ4cOadSoUYqNjVVmZqb27duntrY2tbW1ad++fcrMzFRsbKxGjhypQ4cOycy0bdu2HmNdzxxApPn66681cOBAtbW1yczU1tamgQMH6uuvv3a7NaB/rA/Lli2zSZMmWXR0tHm9Xnv33XftzJkzNm/ePPP7/TZ//nxraWkxM7Pu7m57+umnzefzWXJysh0+fDg8TnFxscXHx1t8fLyVlJSE64cPH7akpCTz+Xy2atUq6+7uNjNzNEdvZs2a1a/tgFslOjrahg8fblOmTDGPx2NTpkyx4cOHW3R0tNutAWG9HTs9ZrfHFbtAIKBQKOR2G0DYd9f1Jk6cqC+//FITJkwI3yxym/xZIgL0duzkhnPARYMGDVJra6vMTK2trRo0aJDbLQH9RoAALurs7FRhYaHa29tVWFiozs5Ot1sC+o1TWIBLPB6PoqOje4TGd7/fJn+WiACcwgJ+pDo7O8OPLomKimIFgohCgAAuGz9+fI+fQKQgQAAXDRkyRK2trZK+fejokCFDXO4I6D8CBHDRgAED5PV65fF45PV6NWDAALdbAvrthp+FBcCZAQMGqL29Xe3t7ZKkU6dOhetAJGAFArikq6vruurAjw0BAgBwhAABXDZmzJgeP4FIQYAALvvqq696/AQiBQECuCwtLU2NjY1KS0tzuxXgunAXFuCyf//737rzzjvdbgO4bqxAAACOECAAAEcIEMBl332x1Hc/gUhBgAAu++7R7TzCHZGGAAEAOEKAAAAcIUAAAI4QIAAARwgQAIAjBAgAwBECBADgCAECAHCEAAEAOEKAAAAcIUAAAI4QIAAARwgQAIAjNxQgU6ZMUUpKimbOnKlAICBJam1tVUZGhhISEpSRkaG2tjZJ3z5pdM2aNfL7/UpNTdWRI0fC4wSDQSUkJCghIUHBYDBcr6qqUkpKivx+v9asWRN+Wum15gAA3Do3vAI5cOCAjh49qlAoJEkqLCzU/PnzVVNTo/nz56uwsFCS9P7776umpkY1NTUqKirSypUrJX0bBuvXr9enn36qyspKrV+/PhwIK1eu1DvvvBPer6Kiotc5AAC3zg9+CqusrEy5ubmSpNzcXO3evTtcz8nJkcfj0Zw5c3T27Fk1NTVp7969ysjIUExMjMaMGaOMjAxVVFSoqalJ58+f15w5c+TxeJSTk9NjrKvNAQC4dW4oQDwej375y19q1qxZKioqkiSdPn1asbGxkqRJkybp9OnTkqSGhgZNnjw5vG9cXJwaGhp6rcfFxV1R722O7ysqKlIgEFAgEFBzc/ONvFUAwPdE38jOBw8elNfr1ZdffqmMjAz99Kc/7fG6x+O56V/T2dsc+fn5ys/Pl6TwNRoAwA/jhlYgXq9XkjRhwgQ98MADqqys1MSJE9XU1CRJampq0oQJE8Lb1tXVhfetr6+X1+vttV5fX39FXdI15wAA3DqOA6S9vV0XLlwI/3vfvn1KTk5WVlZW+E6qYDCoxYsXS5KysrK0bds2mZkOHTqkUaNGKTY2VpmZmdq3b5/a2trU1tamffv2KTMzU7GxsRo5cq
QOHTokM9O2bdt6jHW1OQAAt47jU1inT5/WAw88IEnq7OzUo48+qoULFyotLU0PP/ywiouL9ZOf/ER///vfJUn333+/ysvL5ff7NXToUP3lL3+RJMXExOiPf/yj0tLSJEl/+tOfFBMTI0l688039cQTT+ibb77Rfffdp/vuu0+S9MILL1x1DgDAreOx7z5c8T8uEAiEbzUGfgx6uz54m/xZIgL0duzkk+gAAEcIEACAIwQIAMARAgQA4AgBAgBwhAABADhCgAAAHCFAAACOECAAAEcIEACAIwQIAMARAgQA4AgBAgBwhAABADhCgAAAHCFAAACOECAAAEcIEACAIwQIAMARAgQA4AgBAgBwhAABADhCgAAAHCFAAACOECAAAEcIEACAIwQIAMARAgQA4AgBAgBwhAABADhCgAAAHInoAKmoqND06dPl9/tVWFjodjsAcFuJ2ADp6urSqlWr9P777+v48eMqLS3V8ePH3W4LAG4bERsglZWV8vv98vl8GjRokJYtW6aysjK32wKA20a02w041dDQoMmTJ4d/j4uL06efftpjm6KiIhUVFUmSmpubb2l/+N+QEky5aWMn/zX5ls/739z/3pRxcXuK2ADpj/z8fOXn50uSAoGAy90gEt3MA67H47nma2Z20+YFfigRewrL6/Wqrq4u/Ht9fb28Xq+LHQHA7SViAyQtLU01NTU6efKkLl26pB07digrK8vttoB+u9Yqg9UHIkXEnsKKjo7W1q1blZmZqa6uLuXl5SkpKcnttoDrQlggkkVsgEjS/fffr/vvv9/tNgDgthSxp7AAAO4iQAAAjhAgAABHCBAAgCMeu01uAxk3bpymTJnidhvAVTU3N2v8+PFutwFc4dSpUzpz5sxVX7ttAgT4MQsEAgqFQm63AVwXTmEBABwhQAAAjhAgwI/Adw/9BCIJ10AAAI6wAgEAOEKAAAAcIUAAFzz55JM6fvy4220AN4RrIAAAR1iBADdZe3u7fvWrX2nGjBlKTk7W3/72N/3iF79QKBTSP/7xD82cOVMzZ87U9OnTNXXqVElSVVWVfv7zn2vWrFnKzMxUU1OTy+8CuBIBAtxkFRUVuvPOO/Wf//xH1dXVWrhwYfi1rKwsHT16VEePHtWMGTO0du1aXb58WatXr9auXbtUVVWlvLw8FRQUuPgOgKuL6C+UAiJBSkqKnnvuOf3+97/Xr3/9a/3sZz+7YpvNmzfrjjvu0KpVq1RdXa3q6mplZGRIkrq6uhQbG3ur2wb6RIAAN9m0adN05MgRlZeX6w9/+IPmz5/f4/V//vOf2rlzpz788ENJ337NbVJSkj755BM32gX6jVNYwE3W2NiooUOH6vHHH9fzzz+vI0eOhF/74osvtGrVKu3cuVN33HGHJGn69Olqbm4OB8jly5d17NgxV3oHesMKBLjJ/vvf/+r5559XVFSUBg4cqD//+c9au3atJOmvf/2rWlpa9Jvf/EaSdOedd6q8vFy7du3SmjVrdO7cOXV2dup3v/udkpKS3HwbwBW4jRcA4AinsAAAjhAgAABHCBAAgCMECADAEQIEAOAIAQIAcIQAAQA48n9/MnTixaYubQAAAABJRU5ErkJggg==\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"df[\"size\"].plot(kind=\"box\")"
]
},
{
"cell_type": "code",
"execution_count": 45,
"metadata": {},
"outputs": [],
"source": [
"df['size_clip'] = df['size'].clip(df['size'].quantile(0.01), df['size'].quantile(0.99))"
]
},
{
"cell_type": "code",
"execution_count": 46,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 46,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAYMAAAD5CAYAAADFqlkBAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAAX2UlEQVR4nO3df2zU9eH48efNfnAwBVptobYNyK5DKD+KXqFbsjnsbrVIikxS2JwtgmlSyerYj2AyYPaf2Zks2Q9lSUnHrmaxIn9QM6F00xkzZ2mOqpv263Is7dJ2t1qhBcf4We/7h9lFBrSVUQryfPxFX3fv971ef9w9uff7fXeBRCKRQJJ0TfvUeE9AkjT+jIEkyRhIkoyBJAljIEkCUsZ7Ahfr5ptvZubMmeM9DUm6anR1dfHee++d97arNgYzZ84kGo2O9zQk6aoRCoUueJuHiSRJxkCSZAwkSRgDSRLGQJLEVXw1kXSlCQQC54z5PZC6WvjOQLoEzheC4calK40xkCQZA0mSMZAkYQwkSYwyBoODg6xatYrbbruNOXPm8Nprr3H48GHC4TC5ubmEw2EGBgaAD6+eqK6uJhgMsmDBAtrb25P7iUQi5ObmkpubSyQSSY4fOHCA+fPnEwwGqa6u9goMSbrMRhWDRx55hLvvvpt33nmHN998kzlz5lBbW0tRURGxWIyioiJqa2sB2Lt3L7FYjFgsRl1dHVVVVQAcPnyYmpoa9u/fT1tbGzU1NcmAVFVVsX379uR2zc3NY7RcSdL5jBiDI0eO8Morr7B+/XoAJkyYwNSpU2lqaqKiogKAiooKdu/eDUBTUxPl5eUEAgEKCwsZHBwkHo+zb98+wuEwaWlppKamEg6HaW5uJh6Pc/ToUQoLCwkEApSXlyf3JUm6PEaMQWdnJ+np6Tz44IMsWrSIhx56iGPHjtHX10dmZiYA06dPp6+vD4De3l5ycnKS22dnZ9Pb2zvseHZ29jnj51NXV0coFCIUCtHf339xK5YknWPEGJw5c4b29naqqqp4/fXX+cxnPpM8JPQfgUDgsny4prKykmg0SjQaJT09fcwfT5KuFSPGIDs7m+zsbJYsWQLAqlWraG9vZ9q0acTjcQDi8TgZGRkAZGVl0d3dndy+p6eHrKysYcd7enrOGZckXT4jxmD69Onk5OTw17/+FYAXX3yRuXPnUlpamrwiKBKJsGLFCgBKS0tpaGggkUjQ2trKlClTyMzMpLi4mJaWFgYGBhgYGKClpYXi4mIyMzOZPHkyra2tJBIJGhoakvuSJF0eo/qiul/84hfcf//9nDp1ilmzZrFjxw4++OADysrKqK+vZ8aMGezcuROAZcuWsWfPHoLBIJMmTWLHjh0ApKWlsWXLFgoKCgDYunUraWlpAGzbto21a9dy/PhxSkpKKCkpGYu1SpIuIJC4Si/qD4VC/gayrhjDnTO7Sp9i+gQa7nXTTyBLkoyBJMkYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIwBpIkjIEkCWMgScIYSJIYZQxmzpzJ/Pnzyc/PJxQKAXD48GHC4TC5ubmEw2EGBgYASCQSVFdXEwwGWbBgAe3t7cn9RCIRcnNzyc3NJRKJJMcPHDjA/PnzCQaDVFdXk0gkLuUaJUkjGPU7gz/84Q+88cYbRKNRAGpraykqKiIWi1FUVERtbS0Ae/fuJRaLEYvFqKuro6qqCvgwHjU1Nezfv5+2tjZqamqSAamqqmL79u3J7Zqbmy/1OiVJw7jow0RNTU1UVFQAUFFRwe7du5Pj5eXlBAIBCgsLGRwcJB6Ps2/fPsLhMGlpaaSmphIOh2lubiYej3P06FEKCwsJBAKUl5cn9yVJujxGFYNAIMBXv/pV7rjjDurq6gDo6+sjMzMTgOnTp9PX1wdAb28vOTk5yW2zs7
Pp7e0ddjw7O/uc8fOpq6sjFAoRCoXo7+//mEuVJF1Iymju9Mc//pGsrCzeffddwuEwt91221m3BwIBAoHAmEzwoyorK6msrARInruQJP3vRvXOICsrC4CMjAxWrlxJW1sb06ZNIx6PAxCPx8nIyEjet7u7O7ltT08PWVlZw4739PScMy5JunxGjMGxY8d4//33k/9uaWlh3rx5lJaWJq8IikQirFixAoDS0lIaGhpIJBK0trYyZcoUMjMzKS4upqWlhYGBAQYGBmhpaaG4uJjMzEwmT55Ma2sriUSChoaG5L4kSZfHiIeJ+vr6WLlyJQBnzpzhG9/4BnfffTcFBQWUlZVRX1/PjBkz2LlzJwDLli1jz549BINBJk2axI4dOwBIS0tjy5YtFBQUALB161bS0tIA2LZtG2vXruX48eOUlJRQUlIyJouVJJ1fIHGVXtQfCoWSl7lK4224c2ZX6VNMn0DDvW76CWRJkjGQJBkDSRLGQJKEMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRIfIwZDQ0MsWrSI5cuXA9DZ2cmSJUsIBoOsXr2aU6dOAXDy5ElWr15NMBhkyZIldHV1Jffx+OOPEwwGmT17Nvv27UuONzc3M3v2bILBILW1tZdoaZKk0Rp1DH72s58xZ86c5N+bNm1i48aNHDx4kNTUVOrr6wGor68nNTWVgwcPsnHjRjZt2gRAR0cHjY2NvP322zQ3N/Pwww8zNDTE0NAQGzZsYO/evXR0dPDMM8/Q0dFxiZcpSRrOqGLQ09PDCy+8wEMPPQRAIpHgpZdeYtWqVQBUVFSwe/duAJqamqioqABg1apVvPjiiyQSCZqamlizZg3XX389t956K8FgkLa2Ntra2ggGg8yaNYsJEyawZs0ampqaxmKtkqQLGFUMvv3tb/PEE0/wqU99ePdDhw4xdepUUlJSAMjOzqa3txeA3t5ecnJyAEhJSWHKlCkcOnTorPGPbnOh8fOpq6sjFAoRCoXo7++/iOVKks5nxBj89re/JSMjgzvuuONyzGdYlZWVRKNRotEo6enp4z0dSfrESBnpDq+++irPP/88e/bs4cSJExw9epRHHnmEwcFBzpw5Q0pKCj09PWRlZQGQlZVFd3c32dnZnDlzhiNHjnDTTTclx//jo9tcaFySdHmM+M7g8ccfp6enh66uLhobG7nrrrv4zW9+w9KlS9m1axcAkUiEFStWAFBaWkokEgFg165d3HXXXQQCAUpLS2lsbOTkyZN0dnYSi8VYvHgxBQUFxGIxOjs7OXXqFI2NjZSWlo7hkiVJ/23EdwYX8uMf/5g1a9awefNmFi1axPr16wFYv349DzzwAMFgkLS0NBobGwHIy8ujrKyMuXPnkpKSwlNPPcV1110HwJNPPklxcTFDQ0OsW7eOvLy8S7A0SdJoBRKJRGK8J3ExQqEQ0Wh0vKchARAIBC5421X6FNMn0HCvm34CWZJkDCRJxkCShDGQJGEMJEkYA0kSxkCShDGQJGEMJEkYA0kSxkCShDGQJGEMJEkYA0kSxkCShDGQJGEMJEkYA0kSxkCShDGQJGEMJEkYA0kSxkCSxChicOLECRYvXszChQvJy8vjhz/8IQCdnZ0sWbKEYDDI6tWrOXXqFAAnT55k9erVBINBlixZQldXV3Jfjz/+OMFgkNmzZ7Nv377keHNzM7NnzyYYDFJbW3uJlyhJGsmIMbj++ut56aWXePPNN3njjTdobm6mtbWVTZs2sXHjRg4ePEhqair19fUA1NfXk5qaysGDB9m4cSObNm0CoKOjg8bGRt5++22am5t5+OGHGRoaYmhoiA0bNrB37146Ojp45pln6OjoGNtVS5LOMmIMAoEAN9xwAwCnT5/m9OnTBAIBXnrpJVatWgVARUUFu3fvBqCpqYmKigoAVq1axYsvvkgikaCpqYk1a9
Zw/fXXc+uttxIMBmlra6OtrY1gMMisWbOYMGECa9asoampaazWK0k6j1GdMxgaGiI/P5+MjAzC4TCf/exnmTp1KikpKQBkZ2fT29sLQG9vLzk5OQCkpKQwZcoUDh06dNb4R7e50Pj51NXVEQqFCIVC9Pf3X9yKJUnnGFUMrrvuOt544w16enpoa2vjnXfeGet5nVdlZSXRaJRoNEp6evq4zEGSPok+1tVEU6dOZenSpbz22msMDg5y5swZAHp6esjKygIgKyuL7u5uAM6cOcORI0e46aabzhr/6DYXGpckXT4jxqC/v5/BwUEAjh8/zu9+9zvmzJnD0qVL2bVrFwCRSIQVK1YAUFpaSiQSAWDXrl3cddddBAIBSktLaWxs5OTJk3R2dhKLxVi8eDEFBQXEYjE6Ozs5deoUjY2NlJaWjtV6JUnnkTLSHeLxOBUVFQwNDfHBBx9QVlbG8uXLmTt3LmvWrGHz5s0sWrSI9evXA7B+/XoeeOABgsEgaWlpNDY2ApCXl0dZWRlz584lJSWFp556iuuuuw6AJ598kuLiYoaGhli3bh15eXljuGRJ0n8LJBKJxHhP4mKEQiGi0eh4T0MCPrzq7kKu0qeYPoGGe930E8iSJGMgSTIGkiSMgSQJYyBJwhhIkjAGkiSMgSQJYyBJwhhIkjAGkiSMgSQJYyBJwhhIkjAGkiSMgSQJYyBJwhhIkjAGkiSMgSQJYyBJwhhIkjAGkiRGEYPu7m6WLl3K3LlzycvL42c/+xkAhw8fJhwOk5ubSzgcZmBgAIBEIkF1dTXBYJAFCxbQ3t6e3FckEiE3N5fc3FwikUhy/MCBA8yfP59gMEh1dTWJROJSr1OSNIwRY5CSksJPfvITOjo6aG1t5amnnqKjo4Pa2lqKioqIxWIUFRVRW1sLwN69e4nFYsRiMerq6qiqqgI+jEdNTQ379++nra2NmpqaZECqqqrYvn17crvm5uYxXLIk6b+NGIPMzExuv/12AG688UbmzJlDb28vTU1NVFRUAFBRUcHu3bsBaGpqory8nEAgQGFhIYODg8Tjcfbt20c4HCYtLY3U1FTC4TDNzc3E43GOHj1KYWEhgUCA8vLy5L4kSZdHyse5c1dXF6+//jpLliyhr6+PzMxMAKZPn05fXx8Avb295OTkJLfJzs6mt7d32PHs7Oxzxs+nrq6Ouro6APr7+z/O1CVJwxj1CeR//etf3Hffffz0pz9l8uTJZ90WCAQIBAKXfHL/rbKykmg0SjQaJT09fcwfT5KuFaOKwenTp7nvvvu4//77+drXvgbAtGnTiMfjAMTjcTIyMgDIysqiu7s7uW1PTw9ZWVnDjvf09JwzLkm6fEaMQSKRYP369cyZM4fvfOc7yfHS0tLkFUGRSIQVK1YkxxsaGkgkErS2tjJlyhQyMzMpLi6mpaWFgYEBBgYGaGlpobi4mMzMTCZPnkxrayuJRIKGhobkviRJl8eI5wxeffVVnn76aebPn09+fj4AP/rRj3j00UcpKyujvr6eGTNmsHPnTgCWLVvGnj17CAaDTJo0iR07dgCQlpbGli1bKCgoAGDr1q2kpaUBsG3bNtauXcvx48cpKSmhpKRkTBYrSTq/QOIqvag/FAoRjUbHexoSwLDnzK7Sp5g+gYZ73fQTyJIkYyBJMgaSJIyBJAljIEnCGEiSMAaSJIyBJAljIEnCGEiSMAaSJIyBJAljIEnCGEiSMAaSJIyBJAljIEnCGEiSMAaSJIyBJAljIEnCGEiSMAaSJEYRg3Xr1pGRkcG8efOSY4cPHyYcDpObm0s4HGZgYACARCJBdXU1wWCQBQsW0N7entwmEomQm5tLbm4ukUgkOX7gwAHmz59PMBikurqaRCJxKdcnSRqFEWOwdu1ampubzxqrra2lqKiIWCxGUVERtbW1AOzdu5dYLEYsFqOuro6qqirgw3jU1NSwf/9+2traqKmpSQakqqqK7du3J7f778eSJI29EWPwpS99ibS0tL
PGmpqaqKioAKCiooLdu3cnx8vLywkEAhQWFjI4OEg8Hmffvn2Ew2HS0tJITU0lHA7T3NxMPB7n6NGjFBYWEggEKC8vT+5LknT5pFzMRn19fWRmZgIwffp0+vr6AOjt7SUnJyd5v+zsbHp7e4cdz87OPmf8Qurq6qirqwOgv7//YqYuSTqP//kEciAQIBAIXIq5jKiyspJoNEo0GiU9Pf2yPKYkXQsuKgbTpk0jHo8DEI/HycjIACArK4vu7u7k/Xp6esjKyhp2vKen55xxSdLldVExKC0tTV4RFIlEWLFiRXK8oaGBRCJBa2srU6ZMITMzk+LiYlpaWhgYGGBgYICWlhaKi4vJzMxk8uTJtLa2kkgkaGhoSO5LknT5jHjO4Otf/zovv/wy7733HtnZ2dTU1PDoo49SVlZGfX09M2bMYOfOnQAsW7aMPXv2EAwGmTRpEjt27AAgLS2NLVu2UFBQAMDWrVuTJ6W3bdvG2rVrOX78OCUlJZSUlIzVWiVJFxBIXKUX9odCIaLR6HhPQwIY9rzZVfoU0yfQcK+bfgJZkmQMJEnGQJKEMZAkYQwkSRgDSRLGQJKEMZAkYQwkSRgDSRLGQJKEMZAkYQwkSVzkz15K14pL8St+o92H326q8WQMpGGM9gXar7DW1c7DRJIkYyBdChf637/vCnS1MAbSJZJIJEgkEszY9Nvkv6WrhTGQJBkDSZIxkCThpaW6xiysaeHI8dNj/jgzH31hTPc/ZeL/8eYPvzqmj6FrizHQNeXI8dN01d4z3tP4n411bHTtuWIOEzU3NzN79myCwSC1tbXjPR1JuqZcETEYGhpiw4YN7N27l46ODp555hk6OjrGe1qSdM24Ig4TtbW1EQwGmTVrFgBr1qyhqamJuXPnjvPM9Elz45xHmR95dLyn8T+7cQ7A1X+4S1eOKyIGvb295OTkJP/Ozs5m//7959yvrq6Ouro6APr7+y/b/PTJ8f7/+3iHIP/+4+VjNJNzzdj021Hfd8rE/xvDmehadEXEYLQqKyuprKwEIBQKjfNsdDX62CePa/0Usa4NV8Q5g6ysLLq7u5N/9/T0kJWVNY4zkqRryxURg4KCAmKxGJ2dnZw6dYrGxkZKS0vHe1qSdM24Ig4TpaSk8OSTT1JcXMzQ0BDr1q0jLy9vvKclSdeMKyIGAMuWLWPZsmXjPQ1JuiZdEYeJJEnjyxhIkoyBJMkYSJKAQOIq/W2+m2++mZkzZ473NKRz9Pf3k56ePt7TkM7R1dXFe++9d97brtoYSFeqUChENBod72lIH4uHiSRJxkCSZAykS+4/X6YoXU08ZyBJ8p2BJMkYSJIwBpIkjIF0Xg899BAdHR1jtv+uri7mzZsHQDQapbq6esweSxoNTyBL46Crq4vly5fz1ltvjfdUJMB3BhLHjh3jnnvuYeHChcybN49nn32WL3/5y0SjUZ5//nny8/PJz89n9uzZ3HrrrQAcOHCAO++8kzvuuIPi4mLi8fgF93/w4EG+8pWvsHDhQm6//Xb+9re/nXX7yy+/zPLlywF47LHHeOCBB/j85z9Pbm4u27dvH7uFSx9xxfy4jTRempubueWWW3jhhRcAOHLkCL/85S8BKC0tTf4Ea1lZGXfeeSenT5/mW9/6Fk1NTaSnp/Pss8/ygx/8gF/96lfn3f/999/Po48+ysqVKzlx4gQffPAB77777gXn8+c//5nW1laOHTvGokWLuOeee7jlllsu8aqlsxkDXfPmz5/Pd7/7XTZt2sTy5cv54he/eM59nnjiCSZOnMiGDRt46623eOuttwiHwwAMDQ2RmZl53n2///779Pb2snLlSgA+/elPjzifFStWMHHiRCZOnMjSpUtpa2vj3nvv/R9WKI3MGOia97nPfY729nb27NnD5s2bKSoqOuv23//+9zz33HO88sorACQSCfLy8njttdfGZD6BQGDYv6Wx4DkDXfP+8Y9/MGnSJL75zW/y/e9/n/b29uRtf//739mwYQ
PPPfccEydOBGD27Nn09/cnY3D69Gnefvvt8+77xhtvJDs7m927dwNw8uRJ/v3vfw87n6amJk6cOMGhQ4d4+eWXKSgouBTLlIZlDHTN+8tf/sLixYvJz8+npqaGzZs3J2/79a9/zaFDh7j33nvJz89n2bJlTJgwgV27drFp0yYWLlxIfn4+f/rTny64/6effpqf//znLFiwgC984Qv885//HHY+CxYsYOnSpRQWFrJlyxbPF+iy8NJS6Qry2GOPccMNN/C9731vvKeia4zvDCRJvjOQLpUNGzbw6quvnjX2yCOP8OCDD47TjKTRMwaSJA8TSZKMgSQJYyBJwhhIkoD/D18KFD/XDpzGAAAAAElFTkSuQmCC\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Box plot of the size_clip column.\n",
"df[\"size_clip\"].plot.box()"
]
},
{
"cell_type": "code",
"execution_count": 47,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 47,
"metadata": {},
"output_type": "execute_result"
},
{
"data": {
"image/png": "iVBORw0KGgoAAAANSUhEUgAAAX0AAAD8CAYAAACb4nSYAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4xLjIsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy8li6FKAAASiUlEQVR4nO3cf2jU9+H48efVFKmsi0kxml6Cul3mNP6cifrPfmi4pliJ6yrR/agZWjKszNWtRWFqyT8zKwxWmAwOgjvHWGbzhyk1xnS6MVaq4XSWqR/hBDOSW8hSTdxWWk2y+/5Rlu+s0aStSWxfzwcEzOve7/e9XhCfd7zuRySbzWaRJAXhgcmegCRp4hh9SQqI0ZekgBh9SQqI0ZekgBh9SQrImKI/Z84cFi1axNKlSykrKwPg2rVrxONxSkpKiMfj9PX1AZDNZtmxYwexWIzFixdz9uzZ4eskk0lKSkooKSkhmUyOw3IkSXcz5mf6f/jDHzh37hypVAqA+vp6KioqSKfTVFRUUF9fD8CxY8dIp9Ok02kSiQTbtm0D3n+QqKur4/Tp07S3t1NXVzf8QCFJmhgfeXunubmZmpoaAGpqajhy5Mjw+ObNm4lEIqxatYr+/n66u7s5fvw48Xic/Px88vLyiMfjtLa23ptVSJLGJGcsB0UiER577DEikQjf+973qK2tpaenh8LCQgBmzZpFT08PAJlMhuLi4uFzi4qKyGQydxz/oEQiQSKRAODSpUt88Ytf/Oirk+6xM2fO3PG25cuXT+BMpDvr6Ojg7bffHvG2MUX/z3/+M9FolH/84x/E4/HbQhyJRIhEIh9/pkBtbS21tbUAlJWVDW8nSfeDu/2d+7eq+8V/X3sdyZi2d6LRKAAFBQU8+eSTtLe3M3PmTLq7uwHo7u6moKBg+NjOzs7hc7u6uohGo3cclyRNnFGj/8477/Cvf/1r+N9tbW0sXLiQqqqq4XfgJJNJ1q9fD0BVVRWHDh0im81y6tQpcnNzKSwspLKykra2Nvr6+ujr66OtrY3KyspxXJok6YNG3d7p6enhySefBGBwcJBvfetbPP7445SXl1NdXU1DQwOzZ8/m8OHDAKxdu5aWlhZisRjTpk3j4MGDAOTn57N3717Ky8sB2LdvH/n5+eO1LknSCCL381cru6ev+83d9vTv4/9KCszd2uknciUpIEZfkgJi9CUpIEZfkgJi9CUpIEZfkgJi9CUpIEZfkgJi9CUpIEZfkgJi9CUpIEZfkgJi9CUpIEZfkgJi9CUpIEZfkgJi9CUpIEZfkgJi9CUpIEZfkgJi9CUpIEZfkgJi9CUpIEZfkgJi9CUpIEZfkgJi9CUpIEZfkgJi9CUpIEZfkgJi9CUpIEZfkgJi9CUpIEZfkgJi9CUpIGOO/tDQEMuWLWPdunUAXLlyhZUrVxKLxdi4cSM3b94E4MaNG2zcuJFYLMbKlSvp6OgYvsb+/fuJxWLMmzeP48eP39uVSJJGNebov/zyy8yfP3/49127drFz504uX75MXl4eDQ0NADQ0NJCXl8fly5fZuXMnu3btAuDixYs0NjZy4cIFWltbefbZZxkaGrrHy5Ek3c2Yot/V1cXRo0d55plnAMhms5w8eZINGzYAUFNTw5EjRwBobm6mpqYGgA0bNnDixAmy2SzNzc1s2rSJqVOnMnfuXGKxGO3t7eOxJulDi0QiY/r5uNeQJtuYov/cc8/x0ksv8cAD7x9+9epVpk+fTk5ODgBFRUVkMhkAMpkMxcXFAOTk5JCbm8vVq1dvGf/gOf8rkUhQVlZGWVkZvb29H2910hhls9kx/Xzca0iTbdTov/baaxQUFLB8+fKJmA+1tbWkUilSqRQzZsyYkPuUpFDkjHbAG2+8wauvvkpLSwvvvfce//znP/nBD35Af38/g4OD5OTk0NXVRTQaBSAajdLZ2UlRURGDg4Ncv36dRx55ZHj8v/73HOmTIp
vNjrhN47N4fVKM+kx///79dHV10dHRQWNjI2vWrOE3v/kNq1evpqmpCYBkMsn69esBqKqqIplMAtDU1MSaNWuIRCJUVVXR2NjIjRs3uHLlCul0mhUrVozj0qTx8d+tmtm7XnPbRp84oz7Tv5Of/vSnbNq0iT179rBs2TK2bt0KwNatW3n66aeJxWLk5+fT2NgIQGlpKdXV1SxYsICcnBwOHDjAlClT7s0qJEljEsnex09TysrKSKVSkz0NaURzdh+lo/6JyZ6GdJu7tdNP5EpSQIy+JAXE6EtSQIy+JAXE6EtSQIy+JAXE6EtSQIy+JAXE6EtSQIy+JAXE6EtSQIy+JAXE6EtSQIy+JAXE6EtSQIy+JAXE6EtSQIy+JAXE6EtSQIy+JAXE6EtSQIy+JAXE6EtSQIy+JAXE6EtSQIy+JAXE6EtSQIy+JAXE6EtSQIy+JAXE6EtSQIy+JAXE6EtSQIy+JAVk1Oi/9957rFixgiVLllBaWsqLL74IwJUrV1i5ciWxWIyNGzdy8+ZNAG7cuMHGjRuJxWKsXLmSjo6O4Wvt37+fWCzGvHnzOH78+PisSJJ0R6NGf+rUqZw8eZK33nqLc+fO0drayqlTp9i1axc7d+7k8uXL5OXl0dDQAEBDQwN5eXlcvnyZnTt3smvXLgAuXrxIY2MjFy5coLW1lWeffZahoaHxXZ0k6RajRj8SifCZz3wGgIGBAQYGBohEIpw8eZINGzYAUFNTw5EjRwBobm6mpqYGgA0bNnDixAmy2SzNzc1s2rSJqVOnMnfuXGKxGO3t7eO1LknSCMa0pz80NMTSpUspKCggHo/z+c9/nunTp5OTkwNAUVERmUwGgEwmQ3FxMQA5OTnk5uZy9erVW8Y/eM7/SiQSlJWVUVZWRm9v78deoCTp/xtT9KdMmcK5c+fo6uqivb2dS5cujduEamtrSaVSpFIpZsyYMW73I0kh+lDv3pk+fTqrV6/mzTffpL+/n8HBQQC6urqIRqMARKNROjs7ARgcHOT69es88sgjt4x/8BxJ0sQYNfq9vb309/cD8O677/L6668zf/58Vq9eTVNTEwDJZJL169cDUFVVRTKZBKCpqYk1a9YQiUSoqqqisbGRGzducOXKFdLpNCtWrBivdUmSRpAz2gHd3d3U1NQwNDTEf/7zH6qrq1m3bh0LFixg06ZN7Nmzh2XLlrF161YAtm7dytNPP00sFiM/P5/GxkYASktLqa6uZsGCBeTk5HDgwAGmTJkyvquTJN0iks1ms5M9iTspKysjlUpN9jSkEc3ZfZSO+icmexrSbe7WTj+RK0kBMfqSFBCjL0kBMfqSFBCjL0kBMfqSFBCjL0kBMfqSFBCjL0kBMfqSFBCjL0kBMfqSFBCjL0kBMfqSFBCjL0kBMfqSFBCjL0kBMfqSFBCjL0kBMfqSFBCjL0kBMfqSFBCjL0kBMfqSFBCjL0kBMfqSFBCjL0kBMfqSFBCjL0kBMfqSFBCjL0kBMfqSFBCjL0kBMfqSFBCjL0kBGTX6nZ2drF69mgULFlBaWsrLL78MwLVr14jH45SUlBCPx+nr6wMgm82yY8cOYrEYixcv5uzZs8PXSiaTlJSUUFJSQjKZHKclSZLuZNTo5+Tk8LOf/YyLFy9y6tQpDhw4wMWLF6mvr6eiooJ0Ok1FRQX19fUAHDt2jHQ6TTqdJpFIsG3bNuD9B4m6ujpOnz5Ne3s7dXV1ww8UkqSJMWr0CwsL+dKXvgTAww8/zPz588lkMjQ3N1NTUwNATU0NR44cAaC5uZnNmzcTiURYtWoV/f39dHd3c/z4ceLxOPn5+eTl5RGPx2ltbR3HpUmSPijnwxzc0dHBX/7yF1auXElPTw+FhYUAzJo1i56eHgAymQzFxcXD5xQVFZHJZO44/kGJRIJEIgFAb2/vh1+RJOmOxvxC7r///W+eeuopfv7zn/PZz372ltsikQiRSOSeTKi2tpZUKkUqlWLGjBn35JqSpPeNKfoDAw
M89dRTfPvb3+Yb3/gGADNnzqS7uxuA7u5uCgoKAIhGo3R2dg6f29XVRTQaveO4JGnijBr9bDbL1q1bmT9/Pj/84Q+Hx6uqqobfgZNMJlm/fv3w+KFDh8hms5w6dYrc3FwKCwuprKykra2Nvr4++vr6aGtro7KycpyWJUkayah7+m+88Qa//vWvWbRoEUuXLgXgJz/5Cbt376a6upqGhgZmz57N4cOHAVi7di0tLS3EYjGmTZvGwYMHAcjPz2fv3r2Ul5cDsG/fPvLz88drXZKkEUSy2Wx2sidxJ2VlZaRSqcmehjSiObuP0lH/xGRPQ7rN3drpJ3IlKSBGX5ICYvQlKSBGX5ICYvQlKSBGX5ICYvQlKSBGX5ICYvQlKSBGX5IC8qG+T1/6pFhS18b1dwfG/X7m7D46rtfPfehB3nrxsXG9D4XF6OtT6fq7A5+K78UZ7wcVhcftHUkKiNGXpIAYfUkKiNGXpIAYfUkKiNGXpIAYfUkKiNGXpIAYfUkKiNGXpIAYfUkKiNGXpIAYfUkKiNGXpIAYfUkKiNGXpIAYfUkKiNGXpIAYfUkKiNGXpIAYfUkKiNGXpICMGv0tW7ZQUFDAwoULh8euXbtGPB6npKSEeDxOX18fANlslh07dhCLxVi8eDFnz54dPieZTFJSUkJJSQnJZHIcliJJGs2o0f/ud79La2vrLWP19fVUVFSQTqepqKigvr4egGPHjpFOp0mn0yQSCbZt2wa8/yBRV1fH6dOnaW9vp66ubviBQpI0cUaN/le+8hXy8/NvGWtubqampgaAmpoajhw5Mjy+efNmIpEIq1ator+/n+7ubo4fP048Hic/P5+8vDzi8fhtDySSpPGX81FO6unpobCwEIBZs2bR09MDQCaTobi4ePi4oqIiMpnMHcdHkkgkSCQSAPT29n6U6UmS7uBjv5AbiUSIRCL3Yi4A1NbWkkqlSKVSzJgx455dV5L0EaM/c+ZMuru7Aeju7qagoACAaDRKZ2fn8HFdXV1Eo9E7jkuSJtZHin5VVdXwO3CSySTr168fHj906BDZbJZTp06Rm5tLYWEhlZWVtLW10dfXR19fH21tbVRWVt67VUiSxmTUPf1vfvOb/PGPf+Ttt9+mqKiIuro6du/eTXV1NQ0NDcyePZvDhw8DsHbtWlpaWojFYkybNo2DBw8CkJ+fz969eykvLwdg3759t704LEkaf6NG/7e//e2I4ydOnLhtLBKJcODAgRGP37JlC1u2bPmQ05Mk3Ut+IleSAmL0JSkgRl+SAmL0JSkgRl+SAmL0JSkgRl+SAmL0JSkgRl+SAmL0JSkgRl+SAmL0JSkgRl+SAmL0JSkgRl+SAmL0JSkgRl+SAmL0JSkgRl+SAmL0JSkgRl+SAmL0JSkgRl+SAmL0JSkgRl+SAmL0JSkgRl+SAmL0JSkgRl+SApIz2ROQxsPD83ezKLl7sqfxsT08H+CJyZ6GPkWMvj6V/vV/9XTUf/JjOWf30cmegj5l3N6RpIAYfUkKiNGXpIAYfUkKyIRHv7W1lXnz5hGLxaivr5/ou5ekoE3ou3eGhobYvn07r7/+OkVFRZSXl1NVVcWCBQsmchoKxKfhnS+5Dz042VPQp8yERr+9vZ1YLMbnPvc5ADZt2kRzc7PR1z03EW/XnLP76KfibaEKy4RGP5PJUFxcPPx7UVERp0+fvuWYRCJBIpEA4NKlS5SVlU3kFKUxy/b2Uvb7GZM9Dek2HR0dd7ztvvtwVm1tLbW1tZM9DWlUZWVlpFKpyZ6G9KFM6Au50WiUzs7O4d+7urqIRqMTOQVJCtqERr+8vJx0Os2VK1e4efMmjY2NVFVVTeQUJCloE7q9k5OTwy9+8QsqKysZGhpiy5YtlJaWTuQUpHvGbUh9EkWy2Wx2sichSZoYfiJXkgJi9CUpIEZfGqNnnnmGixcvTvY0pI/FPX1JCojP9KURvPPOOzzxxBMsWbKEhQsX8rvf/Y6vfe1rpFIpXn31VZYuXc
rSpUuZN28ec+fOBeDMmTN89atfZfny5VRWVtLd3T3Jq5BuZ/SlEbS2tvLoo4/y1ltvcf78eR5//PHh26qqqjh37hznzp1jyZIlPP/88wwMDPD973+fpqYmzpw5w5YtW/jxj388iSuQRnbffQ2DdD9YtGgRP/rRj9i1axfr1q3jy1/+8m3HvPTSSzz00ENs376d8+fPc/78eeLxOPD+N8oWFhZO9LSlURl9aQRf+MIXOHv2LC0tLezZs4eKiopbbv/973/PK6+8wp/+9CcAstkspaWlvPnmm5MxXWnM3N6RRvD3v/+dadOm8Z3vfIcXXniBs2fPDt/2t7/9je3bt/PKK6/w0EMPATBv3jx6e3uHoz8wMMCFCxcmZe7S3fhMXxrBX//6V1544QUeeOABHnzwQX75y1/y/PPPA/CrX/2Kq1ev8vWvfx2ARx99lJaWFpqamtixYwfXr19ncHCQ5557zq8Z0X3Ht2xKUkDc3pGkgBh9SQqI0ZekgBh9SQqI0ZekgBh9SQqI0ZekgPw/RZ2WF0q2rH4AAAAASUVORK5CYII=\n",
"text/plain": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Box plot of the size column with the y-axis limited to the 0-5000 range.\n",
"df[\"size\"].plot.box(ylim=(0, 5000))"
]
},
{
"cell_type": "code",
"execution_count": 48,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 (7462.2, 7575.6]\n",
"1 (7462.2, 7575.6]\n",
"2 (7462.2, 7575.6]\n",
"3 (7462.2, 7575.6]\n",
"4 (7462.2, 7575.6]\n",
" ... \n",
"1309666 (7121.433, 7235.4]\n",
"1309667 (7121.433, 7235.4]\n",
"1309668 (7121.433, 7235.4]\n",
"1309669 (7121.433, 7235.4]\n",
"1309670 (7121.433, 7235.4]\n",
"Name: price_discrete_bins, Length: 1309671, dtype: category\n",
"Categories (5, interval[float64]): [(7121.433, 7235.4] < (7235.4, 7348.8] < (7348.8, 7462.2] < (7462.2, 7575.6] < (7575.6, 7689.0]]"
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Bin price into 5 intervals with pd.cut and keep the result as a new column.\n",
"df[\"price_discrete_bins\"] = pd.cut(df[\"price\"], bins=5)\n",
"df[\"price_discrete_bins\"]"
]
},
{
"cell_type": "code",
"execution_count": 50,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7121.433, 7235.4] 419820\n",
"(7235.4, 7348.8] 367043\n",
"(7462.2, 7575.6] 301654\n",
"(7348.8, 7462.2] 200231\n",
"(7575.6, 7689.0] 20923\n",
"Name: price_discrete_bins, dtype: int64"
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of rows that fall into each pd.cut interval.\n",
"df[\"price_discrete_bins\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 53,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 high\n",
"1 high\n",
"2 high\n",
"3 high\n",
"4 high\n",
" ... \n",
"1309666 very low\n",
"1309667 very low\n",
"1309668 very low\n",
"1309669 very low\n",
"1309670 very low\n",
"Name: price_discrete_labels, Length: 1309671, dtype: category\n",
"Categories (5, object): [very low < low < mid < high < very high]"
]
},
"execution_count": 53,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Bin price into 5 intervals and name each bin with a label instead of an interval.\n",
"df[\"price_discrete_labels\"] = pd.cut(\n",
"    df[\"price\"],\n",
"    bins=5,\n",
"    labels=[\"very low\", \"low\", \"mid\", \"high\", \"very high\"],\n",
")\n",
"df[\"price_discrete_labels\"]"
]
},
{
"cell_type": "code",
"execution_count": 55,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"very low 0.320554\n",
"low 0.280256\n",
"high 0.230328\n",
"mid 0.152886\n",
"very high 0.015976\n",
"Name: price_discrete_labels, dtype: float64"
]
},
"execution_count": 55,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Share of rows per label: normalize=True returns proportions instead of counts.\n",
"df[\"price_discrete_labels\"].value_counts(normalize=True)"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 (7477.0, 7689.0]\n",
"1 (7477.0, 7689.0]\n",
"2 (7477.0, 7689.0]\n",
"3 (7477.0, 7689.0]\n",
"4 (7477.0, 7689.0]\n",
" ... \n",
"1309666 (7121.999, 7212.5]\n",
"1309667 (7121.999, 7212.5]\n",
"1309668 (7121.999, 7212.5]\n",
"1309669 (7121.999, 7212.5]\n",
"1309670 (7121.999, 7212.5]\n",
"Name: price_discrete_equal_bins, Length: 1309671, dtype: category\n",
"Categories (5, interval[float64]): [(7121.999, 7212.5] < (7212.5, 7292.5] < (7292.5, 7348.5] < (7348.5, 7477.0] < (7477.0, 7689.0]]"
]
},
"execution_count": 76,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Bin price into 5 quantile-based intervals with pd.qcut.\n",
"df[\"price_discrete_equal_bins\"] = pd.qcut(df[\"price\"], q=5)\n",
"df[\"price_discrete_equal_bins\"]"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7121.999, 7212.5] 264009\n",
"(7292.5, 7348.5] 262610\n",
"(7348.5, 7477.0] 262396\n",
"(7477.0, 7689.0] 260412\n",
"(7212.5, 7292.5] 260244\n",
"Name: price_discrete_equal_bins, dtype: int64"
]
},
"execution_count": 77,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Number of rows that fall into each pd.qcut interval.\n",
"df[\"price_discrete_equal_bins\"].value_counts()"
]
},
{
"cell_type": "code",
"execution_count": 59,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 59,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"None == None"
]
},
{
"cell_type": "code",
"execution_count": 62,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 62,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The pd.np alias was deprecated in pandas 1.0 and removed in 2.0; import NumPy directly.\n",
"import numpy as np\n",
"\n",
"# NaN does not compare equal to anything, including itself.\n",
"np.nan == np.nan"
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/r.orac/envs/r/lib/python3.7/site-packages/ipykernel_launcher.py:1: RuntimeWarning: invalid value encountered in log\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
},
{
"data": {
"text/plain": [
"nan"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The pd.np alias was deprecated in pandas 1.0 and removed in 2.0; import NumPy directly.\n",
"import numpy as np\n",
"\n",
"# The log of a negative real number is invalid: NumPy emits a RuntimeWarning and returns nan.\n",
"np.log(-1)"
]
},
{
"cell_type": "code",
"execution_count": 74,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/r.orac/envs/r/lib/python3.7/site-packages/ipykernel_launcher.py:1: RuntimeWarning: invalid value encountered in arccos\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
},
{
"data": {
"text/plain": [
"nan"
]
},
"execution_count": 74,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The pd.np alias was deprecated in pandas 1.0 and removed in 2.0; import NumPy directly.\n",
"import numpy as np\n",
"\n",
"# arccos is only defined on [-1, 1]: NumPy emits a RuntimeWarning and returns nan.\n",
"np.arccos(2)"
]
},
{
"cell_type": "code",
"execution_count": 75,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/r.orac/envs/r/lib/python3.7/site-packages/ipykernel_launcher.py:1: RuntimeWarning: invalid value encountered in log\n",
" \"\"\"Entry point for launching an IPython kernel.\n",
"/Users/r.orac/envs/r/lib/python3.7/site-packages/ipykernel_launcher.py:1: RuntimeWarning: invalid value encountered in arccos\n",
" \"\"\"Entry point for launching an IPython kernel.\n"
]
},
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 75,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# The pd.np alias was deprecated in pandas 1.0 and removed in 2.0; import NumPy directly.\n",
"import numpy as np\n",
"\n",
"# Both sides evaluate to nan, yet the comparison is False because nan != nan.\n",
"np.log(-1) == np.arccos(2)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}