{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Collect Example\n",
    "\n",
    "This example is going to show how to collect a intermediary output from a PyTorch model and create a dataset with it, that can be used for analisys or for training probes and other  techniques."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "tags": [
     "Imports"
    ]
   },
   "outputs": [],
   "source": [
    "import os # Path operations\n",
    "import glob # List files\n",
    "import shutil # Remove generated files\n",
    "\n",
    "import torch # PyTorch\n",
    "from torch.utils.data import Dataset, DataLoader # Creating the example dataset and dataloader\n",
    "\n",
    "from pytorch_probing import collect, CollectedDataset # Collect dataset and load it"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For this example, we create a example model that has two linear layers with a ReLU activation:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "class ExampleModel(torch.nn.Module):\n",
    "    def __init__(self, input_size, hidden_size, output_size):\n",
    "        super().__init__()\n",
    "\n",
    "        self.linear1 = torch.nn.Linear(input_size, hidden_size)\n",
    "        self.relu = torch.nn.ReLU()\n",
    "        self.linear2 = torch.nn.Linear(hidden_size, output_size)\n",
    "\n",
    "    def forward(self, x):\n",
    "        x = self.linear1(x)\n",
    "        x = self.relu(x)\n",
    "        x = self.linear2(x)\n",
    "\n",
    "        return x"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "ExampleModel(\n",
       "  (linear1): Linear(in_features=2, out_features=3, bias=True)\n",
       "  (relu): ReLU()\n",
       "  (linear2): Linear(in_features=3, out_features=1, bias=True)\n",
       ")"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "input_size = 2\n",
    "hidden_size = 3\n",
    "output_size = 1\n",
    "\n",
    "model = ExampleModel(input_size, hidden_size, output_size)\n",
    "model.eval()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And a example dataset that generates inputs and targets with the value index:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "class ExampleDataset(Dataset):\n",
    "    def __init__(self, x_size, y_size, len) -> None:\n",
    "        super().__init__()\n",
    "\n",
    "        self._x_size = x_size\n",
    "        self._y_size = y_size\n",
    "        self._len = len\n",
    "\n",
    "    def __len__(self) -> int:\n",
    "        return self._len\n",
    "    \n",
    "    def __getitem__(self, idx:int):\n",
    "        return torch.empty(self._x_size).fill_(idx), torch.empty(self._y_size).fill_(idx)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We create the dataset and a dataloader from it:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "dataset_size = 32\n",
    "batch_size = 4\n",
    "\n",
    "dataset = ExampleDataset(input_size, output_size, dataset_size)\n",
    "dataloader = DataLoader(dataset, batch_size, shuffle=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And can finally collect the dataset. We pass the model, the \"linear1\" output as the intermediary output to collect, the dataloader, a name for the dataset, and enable the saving of the model inputs, targets and predictions. The function will execute the model over the dataloader elements, and store the required values with the intercepted outputs:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "paths = [\"linear1\"]\n",
    "\n",
    "dataset_path = collect(model, paths, dataloader, dataset_name=\"CollectExample\", \n",
    "                       save_input=True, save_target=True, save_prediction=True)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "After collecting, the dataset is stored in the `dataset_path` path in chunks:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "tags": [
     "Check dataset files"
    ]
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['.\\\\CollectExample\\\\0.pt',\n",
       " '.\\\\CollectExample\\\\1.pt',\n",
       " '.\\\\CollectExample\\\\2.pt',\n",
       " '.\\\\CollectExample\\\\3.pt',\n",
       " '.\\\\CollectExample\\\\4.pt',\n",
       " '.\\\\CollectExample\\\\5.pt',\n",
       " '.\\\\CollectExample\\\\6.pt',\n",
       " '.\\\\CollectExample\\\\7.pt']"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pattern = os.path.join(dataset_path, \"*.pt\")\n",
    "glob.glob(pattern)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can use the `CollectedDataset` to load the dataset:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "tags": [
     "Load dataset"
    ]
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'CollectExample'"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "collected_dataset = CollectedDataset(dataset_path, get_input=True, get_prediction=True, get_target=True)\n",
    "collected_dataset.name"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "It loads all the 32 saved samples:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "tags": [
     "Check dataset size"
    ]
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "32"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(collected_dataset)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can get the first sample of the dataset:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "tags": [
     "View first sample"
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Intercepted Output\n",
      "{'linear1': tensor([0.5083, 0.6371, 0.4391])} \n",
      "\n",
      "Target\n",
      "tensor([0.]) \n",
      "\n",
      "Prediction\n",
      "tensor([0.5722]) \n",
      "\n",
      "Input\n",
      "tensor([0., 0.]) \n",
      "\n"
     ]
    }
   ],
   "source": [
    "intercepted_output, target, prediction, saved_input = collected_dataset[0]\n",
    "\n",
    "print(\"Intercepted Output\")\n",
    "print(intercepted_output, \"\\n\")\n",
    "\n",
    "print(\"Target\")\n",
    "print(target, \"\\n\")\n",
    "\n",
    "print(\"Prediction\")\n",
    "print(prediction, \"\\n\")\n",
    "\n",
    "print(\"Input\")\n",
    "print(saved_input, \"\\n\")\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "And compare with the original values:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "tags": [
     "View original outputs"
    ]
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Original linear1 output\n",
      "tensor([0.5083, 0.6371, 0.4391]) \n",
      "\n",
      "Original target\n",
      "tensor([0.]) \n",
      "\n",
      "Original prediction\n",
      "tensor([0.5722]) \n",
      "\n",
      "Original input\n",
      "tensor([0., 0.]) \n",
      "\n"
     ]
    }
   ],
   "source": [
    "x, y = dataset[0]\n",
    "\n",
    "with torch.no_grad():\n",
    "    pred = model(x)\n",
    "\n",
    "    linear1_output = model.linear1(x)\n",
    "\n",
    "print(\"Original linear1 output\")\n",
    "print(linear1_output, \"\\n\")\n",
    "\n",
    "\n",
    "print(\"Original target\")\n",
    "print(y, \"\\n\")\n",
    "\n",
    "print(\"Original prediction\")\n",
    "print(pred, \"\\n\")\n",
    "\n",
    "print(\"Original input\")\n",
    "print(x, \"\\n\")\n",
    "\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Deletes the generated dataset:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "shutil.rmtree(dataset_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}