haesleinhuepf · haesleinhuepf · Oct 29, 2023 · Oct 29, 2023 · Oct 29, 2023 · Oct 29, 2023
diff --git a/demo/fine-tuning/10_extract_from_notebooks.ipynb b/demo/fine-tuning/10_extract_from_notebooks.ipynb
@@ -0,0 +1,206 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "2e3da9ae-74e1-4083-a793-39e083a84a29",
+   "metadata": {},
+   "source": [
+    "# Extract text and code from notebooks\n",
+    "In this notebook, we semi-automatically iterate through Python Jupyter notebooks and extract a list of text-code pairs that might be useful for LLM fine-tuning.\n",
+    "\n",
+    "The resulting data is stored in a jsonl file."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "id": "2e12c7f0-e5ff-418c-bd4d-bf78a09ae588",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "import openai\n",
+    "import time\n",
+    "from bia_bob import bob\n",
+    "import os\n",
+    "from bia_bob._utilities import filter_out_blacklist, save_jsonl_file\n",
+    "import json\n",
+    "import ipywidgets as widgets\n",
+    "from IPython.display import display"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "b1d52773-2153-4f70-8c58-0a8aaf40b53e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "training_data = []"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "b80ec64c-e1ec-46c6-88f4-eebe6042fb6a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def parse_notebook(notebook_path:str):\n",
+    "    \"\"\"Extract text-code pairs from one notebook and append them to `training_data`.\n",
+    "\n",
+    "    The cells are read in order. Markdown cells are concatenated to a text,\n",
+    "    the code cells following them to a code section. When the next markdown\n",
+    "    cell or the end of the notebook is reached, the section is appended to\n",
+    "    the global list `training_data` as a dictionary with the keys `imports`,\n",
+    "    `text` and `code`.\n",
+    "\n",
+    "    Sections whose code starts with `from` or `import` are not appended.\n",
+    "    Their code is stored instead and passed along as `imports` with the\n",
+    "    sections that follow. Sections whose code consists of whitespace only\n",
+    "    are skipped.\n",
+    "\n",
+    "    Parameters\n",
+    "    ----------\n",
+    "    notebook_path : str\n",
+    "        Path of the notebook file; it is read as nbformat version 4.\n",
+    "    \"\"\"\n",
+    "    import nbformat\n",
+    "\n",
+    "    # Reading the notebook\n",
+    "    with open(notebook_path, 'r', encoding='utf-8') as f:\n",
+    "        notebook = nbformat.read(f, as_version=4)\n",
+    "\n",
+    "    imports_code = \"\"\n",
+    "    pending_text = \"\"\n",
+    "    pending_code = \"\"\n",
+    "\n",
+    "    def finish_section():\n",
+    "        # Store the collected text-code pair (if any) and start a new section\n",
+    "        nonlocal imports_code, pending_text, pending_code\n",
+    "        if len(pending_code) == 0:\n",
+    "            # no code cell since the last section: keep collecting text\n",
+    "            return\n",
+    "\n",
+    "        # remove initial line breaks, tabs and spaces\n",
+    "        code = pending_code.lstrip(\"\\n\\t \")\n",
+    "        if code.startswith((\"from\", \"import\")):\n",
+    "            # we skip the block which contains a lot of introductory text and\n",
+    "            # import statements, but we keep the imports for later\n",
+    "            imports_code = code\n",
+    "        elif len(code) > 0:\n",
+    "            # sections made of empty code cells only are no useful training data\n",
+    "            training_data.append({\n",
+    "                \"imports\":imports_code,\n",
+    "                \"text\":pending_text,\n",
+    "                \"code\":code\n",
+    "            })\n",
+    "        pending_code = \"\"\n",
+    "        pending_text = \"\"\n",
+    "\n",
+    "    for cell in notebook['cells']:\n",
+    "        cell_type = cell['cell_type']\n",
+    "        if cell_type == 'code':\n",
+    "            # Access code cell\n",
+    "            pending_code = pending_code + \"\\n\\n\" + cell['source']\n",
+    "        elif cell_type == 'markdown':\n",
+    "            # a markdown cell after code closes the current section\n",
+    "            finish_section()\n",
+    "\n",
+    "            # Access markdown cell\n",
+    "            pending_text = pending_text + \"\\n\\n\" + cell['source']\n",
+    "\n",
+    "    # Notebooks commonly end with code cells. Without this final call the\n",
+    "    # last section would be dropped because no markdown cell follows it.\n",
+    "    finish_section()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "4881eab3-ae03-4f5e-8ed2-50bdeed35823",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'346 conversations extracted'"
+      ]
+     },
+     "execution_count": 5,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\n",
+    "\n",
+    "def parse_notebooks(directory, extension=\".ipynb\"):\n",
+    "    \"\"\"Call `parse_notebook` for every notebook file below `directory`.\n",
+    "\n",
+    "    All sub-folders are visited. Files are selected by their `extension`;\n",
+    "    folders with `.ipynb_checkpoints` in their path are left out.\n",
+    "    \"\"\"\n",
+    "    for folder, _, filenames in os.walk(directory):\n",
+    "        if \".ipynb_checkpoints\" in folder:\n",
+    "            continue\n",
+    "\n",
+    "        for filename in filenames:\n",
+    "            if filename.endswith(extension):\n",
+    "                parse_notebook(os.path.join(folder, filename))\n",
+    "\n",
+    "parse_notebooks(r\"C:\\structure\\code\\pyclesperanto_prototype\\demo\")\n",
+    "\n",
+    "f\"{len(training_data)} conversations extracted\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "43d955be-02f6-4013-aa5d-668bb4df4b06",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "'198 conversations remaining'"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "\n",
+    "training_data = filter_out_blacklist(training_data, [\n",
+    "    \"napari\",\n",
+    "    \"nbscreenshot\",\n",
+    "    \"def \",\n",
+    "    \"print\",\n",
+    "    \"openai\",\n",
+    "    \"https://\"\n",
+    "])\n",
+    "\n",
+    "f\"{len(training_data)} conversations remaining\""
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "b088341b-36b6-4e16-b366-060cdedcbf64",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "save_jsonl_file(training_data, \"cle_imports_text_code.jsonl\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "89df54c5-682a-424c-acbe-15b7e839e925",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.9.16"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}