{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "%matplotlib inline" ] }, { "cell_type": "code", "execution_count": 62, "metadata": {}, "outputs": [], "source": [ "import logging\n", "import time\n", "from platform import python_version\n", "\n", "import matplotlib\n", "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", "import sklearn\n", "import torch\n", "import torch.nn as nn\n", "import torch.nn.functional as F\n", "import transformers\n", "from sklearn.metrics import roc_auc_score\n", "from torch.autograd import Variable" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "python version==3.7.3\n", "pandas==0.25.3\n", "numpy==1.17.4\n", "torch==1.3.1\n", "sklearn==0.21.0\n", "transformers==2.1.1\n", "matplotlib==3.0.3\n" ] } ], "source": [ "print(\"python version==%s\" % python_version())\n", "print(\"pandas==%s\" % pd.__version__)\n", "print(\"numpy==%s\" % np.__version__)\n", "print(\"torch==%s\" % torch.__version__)\n", "print(\"sklearn==%s\" % sklearn.__version__)\n", "print(\"transformers==%s\" % transformers.__version__)\n", "print(\"matplotlib==%s\" % matplotlib.__version__)" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "logging.getLogger(\"transformers.tokenization_utils\").setLevel(logging.ERROR)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(159571, 8)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = pd.read_csv('data/train.csv')\n", "df.shape" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "np.random.seed(42)\n", "df = df.sample(frac=1)\n", "df = df.reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
idcomment_texttoxicsevere_toxicobscenethreatinsultidentity_hate
07ca72b5b9c688e9eGeez, are you forgetful! We've already discus...000000
1c03f72fd8f8bf54fCarioca RFA \\n\\nThanks for your support on my ...000000
29e5b8e8fc1ff2e84\"\\n\\n Birthday \\n\\nNo worries, It's what I do ...000000
35332799e706665a6Pseudoscience category? \\n\\nI'm assuming that ...000000
4dfa7d8f0b4366680(and if such phrase exists, it would be provid...000000
\n", "
" ], "text/plain": [ " id comment_text toxic \\\n", "0 7ca72b5b9c688e9e Geez, are you forgetful! We've already discus... 0 \n", "1 c03f72fd8f8bf54f Carioca RFA \\n\\nThanks for your support on my ... 0 \n", "2 9e5b8e8fc1ff2e84 \"\\n\\n Birthday \\n\\nNo worries, It's what I do ... 0 \n", "3 5332799e706665a6 Pseudoscience category? \\n\\nI'm assuming that ... 0 \n", "4 dfa7d8f0b4366680 (and if such phrase exists, it would be provid... 0 \n", "\n", " severe_toxic obscene threat insult identity_hate \n", "0 0 0 0 0 0 \n", "1 0 0 0 0 0 \n", "2 0 0 0 0 0 \n", "3 0 0 0 0 0 \n", "4 0 0 0 0 0 " ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.head()" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "\"Geez, are you forgetful! We've already discussed why Marx was not an anarchist, i.e. he wanted to use a State to mold his 'socialist man.' Ergo, he is a statist - the opposite of an anarchist. I know a guy who says that, when he gets old and his teeth fall out, he'll quit eating meat. Would you call him a vegetarian?\"" ] }, "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.comment_text[0]" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
toxicsevere_toxicobscenethreatinsultidentity_hate
103111010
\n", "
" ], "text/plain": [ " toxic severe_toxic obscene threat insult identity_hate\n", "103 1 1 1 0 1 0" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "target_columns = [\"toxic\", \"severe_toxic\", \"obscene\", \"threat\", \"insult\", \"identity_hate\"]\n", "df.iloc[[103]][target_columns]" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [], "source": [ "df_train = df[:10000].reset_index(drop=True)\n", "df_val = df[10000:11000].reset_index(drop=True)\n", "df_test = df[11000:13000].reset_index(drop=True)" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(10000, 8)" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_train.shape" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(1000, 8)" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_val.shape" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(2000, 8)" ] }, "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_test.shape" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "model_class = transformers.BertModel\n", "tokenizer_class = transformers.BertTokenizer\n", "pretrained_weights='bert-base-uncased'" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "I1127 22:16:28.545769 4576552384 configuration_utils.py:151] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-config.json from cache at /Users/r.orac/.cache/torch/transformers/4dad0251492946e18ac39290fcfe91b89d370fee250efe9521476438fe8ca185.bf3b9ea126d8c0001ee8a1e8b92229871d06d36d8808208cc2449280da87785c\n", "I1127 22:16:28.547996 4576552384 configuration_utils.py:168] Model config {\n", " \"attention_probs_dropout_prob\": 0.1,\n", " \"finetuning_task\": null,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout_prob\": 0.1,\n", " \"hidden_size\": 768,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 3072,\n", " \"layer_norm_eps\": 1e-12,\n", " \"max_position_embeddings\": 512,\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 12,\n", " \"num_labels\": 2,\n", " \"output_attentions\": false,\n", " \"output_hidden_states\": false,\n", " \"output_past\": true,\n", " \"pruned_heads\": {},\n", " \"torchscript\": false,\n", " \"type_vocab_size\": 2,\n", " \"use_bfloat16\": false,\n", " \"vocab_size\": 30522\n", "}\n", "\n", "I1127 22:16:29.254769 4576552384 modeling_utils.py:337] loading weights file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-uncased-pytorch_model.bin from cache at /Users/r.orac/.cache/torch/transformers/aa1ef1aede4482d0dbcd4d52baad8ae300e60902e88fcb0bebdec09afd232066.36ca03ab34a1a5d5fa7bc3d03d55c4fa650fed07220e2eeebc06ce58d0e9a157\n" ] } ], "source": [ "tokenizer = tokenizer_class.from_pretrained(pretrained_weights)\n", "bert_model = model_class.from_pretrained(pretrained_weights)" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "max_seq = 100" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "def tokenize_text(df, max_seq):\n", " return [\n", " tokenizer.encode(text, add_special_tokens=True)[:max_seq] for text in df.comment_text.values\n", " ]\n", "\n", "\n", "def pad_text(tokenized_text, max_seq):\n", " return np.array([el + [0] * (max_seq - len(el)) for el in tokenized_text])\n", "\n", "\n", "def tokenize_and_pad_text(df, max_seq):\n", " tokenized_text = tokenize_text(df, max_seq)\n", " padded_text = pad_text(tokenized_text, max_seq)\n", " return torch.tensor(padded_text)\n", "\n", "\n", "def targets_to_tensor(df, target_columns):\n", " return torch.tensor(df[target_columns].values, dtype=torch.float32)" ] }, { "cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "train_indices = tokenize_and_pad_text(df_train, max_seq)\n", "val_indices = tokenize_and_pad_text(df_val, max_seq)\n", "test_indices = tokenize_and_pad_text(df_test, max_seq)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "with torch.no_grad():\n", " x_train = bert_model(train_indices)[0] # Models outputs are tuples\n", " x_val = bert_model(val_indices)[0]\n", " x_test = bert_model(test_indices)[0]" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "y_train = targets_to_tensor(df_train, target_columns)\n", "y_val = targets_to_tensor(df_val, target_columns)\n", "y_test = targets_to_tensor(df_test, target_columns)" ] }, { "cell_type": "code", "execution_count": 138, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[ 0.1020, -0.1540, -0.1991, ..., -0.0927, 0.9687, 0.1253],\n", " [ 0.5115, 0.6774, 1.4377, ..., 1.0570, 0.3752, -0.3614],\n", " [ 0.0124, 0.1622, 1.1159, ..., 0.8882, 0.6164, -0.2798],\n", " ...,\n", " [ 0.1322, 0.0337, 1.0933, ..., -0.6233, 0.1783, -1.1930],\n", " [ 0.0658, 0.0356, 1.0270, ..., -0.6100, 0.0813, -1.1758],\n", " [ 0.2795, 0.3124, 0.8268, ..., -0.6755, -0.0943, -1.2319]])" ] }, "execution_count": 138, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x_train[0]" ] }, { "cell_type": "code", "execution_count": 139, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "torch.Size([100, 768])" ] }, "execution_count": 139, "metadata": {}, "output_type": "execute_result" } ], "source": [ "x_train[0].shape" ] }, { "cell_type": "code", "execution_count": 142, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([0., 0., 0., 0., 0., 0.])" ] }, "execution_count": 142, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_train[0]" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "class KimCNN(nn.Module):\n", " def __init__(self, embed_num, embed_dim, class_num, kernel_num, kernel_sizes, dropout, static):\n", " super(KimCNN, self).__init__()\n", "\n", " V = embed_num\n", " D = embed_dim\n", " C = class_num\n", " Co = kernel_num\n", " Ks = kernel_sizes\n", " \n", " self.static = static\n", " self.embed = nn.Embedding(V, D)\n", " self.convs1 = nn.ModuleList([nn.Conv2d(1, Co, (K, D)) for K in Ks])\n", " self.dropout = nn.Dropout(dropout)\n", " self.fc1 = nn.Linear(len(Ks) * Co, C)\n", " self.sigmoid = nn.Sigmoid()\n", " \n", "\n", " def forward(self, x):\n", " if self.static:\n", " x = Variable(x)\n", "\n", " x = x.unsqueeze(1) # (N, Ci, W, D)\n", "\n", " x = [F.relu(conv(x)).squeeze(3) for conv in self.convs1] # [(N, Co, W), ...]*len(Ks)\n", "\n", " x = [F.max_pool1d(i, i.size(2)).squeeze(2) for i in x] # [(N, Co), ...]*len(Ks)\n", "\n", " x = torch.cat(x, 1)\n", " x = self.dropout(x) # (N, len(Ks)*Co)\n", " logit = self.fc1(x) # (N, C)\n", " output = self.sigmoid(logit)\n", " return output" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [], "source": [ "embed_num = x_train.shape[1]\n", "embed_dim = x_train.shape[2]\n", "class_num = y_train.shape[1]\n", "kernel_num = 3\n", "kernel_sizes = [2, 3, 4]\n", "dropout = 0.5\n", "static = True" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [], "source": [ "model = KimCNN(\n", " embed_num=embed_num,\n", " embed_dim=embed_dim,\n", " class_num=class_num,\n", " kernel_num=kernel_num,\n", " kernel_sizes=kernel_sizes,\n", " dropout=dropout,\n", " static=static,\n", ")" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [], "source": [ "n_epochs = 10\n", "batch_size = 10\n", "lr = 0.001\n", "optimizer = torch.optim.Adam(model.parameters(), lr=lr)\n", "loss_fn = nn.BCELoss()" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "def generate_batch_data(x, y, batch_size):\n", " i, batch = 0, 0\n", " for batch, i in enumerate(range(0, len(x) - batch_size, batch_size), 1):\n", " x_batch = x[i : i + batch_size]\n", " y_batch = y[i : i + batch_size]\n", " yield x_batch, y_batch, batch\n", " if i + batch_size < len(x):\n", " yield x[i + batch_size :], y[i + batch_size :], batch + 1\n", " if batch == 0:\n", " yield x, y, 1" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Epoch 1 Train loss: 0.20. Validation loss: 0.09. Elapsed time: 37.81s.\n", "Epoch 2 Train loss: 0.13. Validation loss: 0.07. Elapsed time: 36.39s.\n", "Epoch 3 Train loss: 0.10. Validation loss: 0.07. Elapsed time: 36.37s.\n", "Epoch 4 Train loss: 0.09. Validation loss: 0.07. Elapsed time: 36.62s.\n", "Epoch 5 Train loss: 0.09. Validation loss: 0.07. Elapsed time: 35.92s.\n", "Epoch 6 Train loss: 0.09. Validation loss: 0.07. Elapsed time: 35.42s.\n", "Epoch 7 Train loss: 0.08. Validation loss: 0.06. Elapsed time: 36.55s.\n", "Epoch 8 Train loss: 0.08. Validation loss: 0.06. Elapsed time: 35.41s.\n", "Epoch 9 Train loss: 0.08. Validation loss: 0.07. Elapsed time: 36.03s.\n", "Epoch 10 Train loss: 0.08. Validation loss: 0.07. Elapsed time: 35.94s.\n" ] } ], "source": [ "train_losses, val_losses = [], []\n", "\n", "for epoch in range(n_epochs):\n", " start_time = time.time()\n", " train_loss = 0\n", "\n", " model.train(True)\n", " for x_batch, y_batch, batch in generate_batch_data(x_train, y_train, batch_size):\n", " y_pred = model(x_batch)\n", " optimizer.zero_grad()\n", " loss = loss_fn(y_pred, y_batch)\n", " loss.backward()\n", " optimizer.step()\n", " train_loss += loss.item()\n", "\n", " train_loss /= batch\n", " train_losses.append(train_loss)\n", " elapsed = time.time() - start_time\n", "\n", " model.eval() # disable dropout for deterministic output\n", " with torch.no_grad(): # deactivate autograd engine to reduce memory usage and speed up computations\n", " val_loss, batch = 0, 1\n", " for x_batch, y_batch, batch in generate_batch_data(x_val, y_val, batch_size):\n", " y_pred = model(x_batch)\n", " loss = loss_fn(y_pred, y_batch)\n", " val_loss += loss.item()\n", " val_loss /= batch\n", " val_losses.append(val_loss)\n", "\n", " print(\n", " \"Epoch %d Train loss: %.2f. Validation loss: %.2f. Elapsed time: %.2fs.\"\n", " % (epoch + 1, train_losses[-1], val_losses[-1], elapsed)\n", " )" ] }, { "cell_type": "code", "execution_count": 37, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Text(0.5, 1.0, 'Losses')" ] }, "execution_count": 37, "metadata": {}, "output_type": "execute_result" }, { "data": { "image/png": "\n", "text/plain": [ "
" ] }, "metadata": { "needs_background": "light" }, "output_type": "display_data" } ], "source": [ "plt.plot(train_losses, label=\"Training loss\")\n", "plt.plot(val_losses, label=\"Validation loss\")\n", "plt.legend()\n", "plt.title(\"Losses\")" ] }, { "cell_type": "code", "execution_count": 52, "metadata": {}, "outputs": [], "source": [ "model.eval() # disable dropout for deterministic output\n", "with torch.no_grad(): # deactivate autograd engine to reduce memory usage and speed up computations\n", " y_preds = []\n", " batch = 0\n", " for x_batch, y_batch, batch in generate_batch_data(x_test, y_test, batch_size):\n", " y_pred = model(x_batch)\n", " y_preds.extend(y_pred.cpu().numpy().tolist())\n", " y_preds_np = np.array(y_preds)" ] }, { "cell_type": "code", "execution_count": 55, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[4.31922777e-03, 1.02647872e-08, 7.67312944e-04, 4.82944529e-08,\n", " 5.48943179e-04, 1.47456039e-05],\n", " [1.93794966e-02, 1.65772087e-06, 4.40812251e-03, 3.24758662e-06,\n", " 4.04525641e-03, 1.62638054e-04],\n", " [1.14875985e-03, 7.16099610e-11, 1.43278172e-04, 7.72443276e-10,\n", " 9.23425468e-05, 1.27330247e-06],\n", " ...,\n", " [3.68908630e-03, 4.57207561e-09, 5.53303165e-04, 2.50337173e-08,\n", " 4.26724349e-04, 8.35142146e-06],\n", " [9.97485360e-04, 4.76052600e-11, 1.12171409e-04, 5.47074053e-10,\n", " 7.51411499e-05, 8.22096808e-07],\n", " [8.12641159e-02, 3.14069737e-04, 3.06670386e-02, 2.79268977e-04,\n", " 2.92679444e-02, 3.49535886e-03]])" ] }, "execution_count": 55, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_preds_np" ] }, { "cell_type": "code", "execution_count": 53, "metadata": {}, "outputs": [], "source": [ "y_test_np = df_test[target_columns].values" ] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "array([[1, 0, 1, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " ...,\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0],\n", " [0, 0, 0, 0, 0, 0]])" ] }, "execution_count": 60, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_test_np[1000:]" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
labelauc
1severe_toxic0.966361
4insult0.959854
0toxic0.954778
3threat0.946667
5identity_hate0.941165
2obscene0.939816
\n", "
" ], "text/plain": [ " label auc\n", "1 severe_toxic 0.966361\n", "4 insult 0.959854\n", "0 toxic 0.954778\n", "3 threat 0.946667\n", "5 identity_hate 0.941165\n", "2 obscene 0.939816" ] }, "execution_count": 74, "metadata": {}, "output_type": "execute_result" } ], "source": [ "auc_scores = roc_auc_score(y_test_np, y_preds_np, average=None)\n", "df_accuracy = pd.DataFrame({\"label\": target_columns, \"auc\": auc_scores})\n", "df_accuracy.sort_values('auc')[::-1]" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "2201" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "positive_labels = df_train[target_columns].sum().sum()\n", "positive_labels" ] }, { "cell_type": "code", "execution_count": 83, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "60000" ] }, "execution_count": 83, "metadata": {}, "output_type": "execute_result" } ], "source": [ "all_labels = df_train[target_columns].count().sum()\n", "all_labels" ] }, { "cell_type": "code", "execution_count": 84, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "0.03668333333333333" ] }, "execution_count": 84, "metadata": {}, "output_type": "execute_result" } ], "source": [ "positive_labels/all_labels" ] }, { "cell_type": "code", "execution_count": 102, "metadata": {}, "outputs": [], "source": [ "df_test_targets = df_test[target_columns]\n", "df_pred_targets = pd.DataFrame(y_preds_np.round(), columns=target_columns, dtype=int)\n", "df_sanity = df_test_targets.join(df_pred_targets, how='inner', rsuffix='_pred')" ] }, { "cell_type": "code", "execution_count": 104, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
toxicsevere_toxicobscenethreatinsultidentity_hatetoxic_predsevere_toxic_predobscene_predthreat_predinsult_predidentity_hate_pred
0000000000000
1000000000000
2000000000000
3000000000000
4000000000000
.......................................
1995000000000000
1996000000000000
1997000000000000
1998000000000000
1999000000000000
\n", "

2000 rows × 12 columns

\n", "
" ], "text/plain": [ " toxic severe_toxic obscene threat insult identity_hate toxic_pred \\\n", "0 0 0 0 0 0 0 0 \n", "1 0 0 0 0 0 0 0 \n", "2 0 0 0 0 0 0 0 \n", "3 0 0 0 0 0 0 0 \n", "4 0 0 0 0 0 0 0 \n", "... ... ... ... ... ... ... ... \n", "1995 0 0 0 0 0 0 0 \n", "1996 0 0 0 0 0 0 0 \n", "1997 0 0 0 0 0 0 0 \n", "1998 0 0 0 0 0 0 0 \n", "1999 0 0 0 0 0 0 0 \n", "\n", " severe_toxic_pred obscene_pred threat_pred insult_pred \\\n", "0 0 0 0 0 \n", "1 0 0 0 0 \n", "2 0 0 0 0 \n", "3 0 0 0 0 \n", "4 0 0 0 0 \n", "... ... ... ... ... \n", "1995 0 0 0 0 \n", "1996 0 0 0 0 \n", "1997 0 0 0 0 \n", "1998 0 0 0 0 \n", "1999 0 0 0 0 \n", "\n", " identity_hate_pred \n", "0 0 \n", "1 0 \n", "2 0 \n", "3 0 \n", "4 0 \n", "... ... \n", "1995 0 \n", "1996 0 \n", "1997 0 \n", "1998 0 \n", "1999 0 \n", "\n", "[2000 rows x 12 columns]" ] }, "execution_count": 104, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_sanity" ] }, { "cell_type": "code", "execution_count": 127, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "toxic 186\n", "severe_toxic 17\n", "obscene 98\n", "threat 5\n", "insult 96\n", "identity_hate 18\n", "dtype: int64" ] }, "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_test_targets.sum()" ] }, { "cell_type": "code", "execution_count": 128, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "toxic 91\n", "severe_toxic 0\n", "obscene 43\n", "threat 0\n", "insult 23\n", "identity_hate 0\n", "dtype: int64" ] }, "execution_count": 128, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_pred_targets.sum()" ] }, { "cell_type": "code", "execution_count": 134, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
toxictoxic_pred
1210
2210
2710
4511
5311
.........
196411
196511
197710
198410
198610
\n", "

186 rows × 2 columns

\n", "
" ], "text/plain": [ " toxic toxic_pred\n", "12 1 0\n", "22 1 0\n", "27 1 0\n", "45 1 1\n", "53 1 1\n", "... ... ...\n", "1964 1 1\n", "1965 1 1\n", "1977 1 0\n", "1984 1 0\n", "1986 1 0\n", "\n", "[186 rows x 2 columns]" ] }, "execution_count": 134, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df_sanity[df_sanity.toxic > 0][['toxic', 'toxic_pred']]" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.3" } }, "nbformat": 4, "nbformat_minor": 4 }