{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tutorial about managing files in batch processing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "When analysing a set of experiments you want to collect, match and group files according to information content and experimental conditions.\n",
    "\n",
    "The Files class will help you."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "import tempfile\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import locan as lc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "lc.show_versions(system=False, dependencies=False, verbose=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Some file structure to be analysed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "directory = Path(tempfile.mkdtemp())\n",
    "subdirectory = directory.joinpath(\"sub_directory\")\n",
    "subdirectory.mkdir()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files = [\n",
    "    directory / \"sub_directory\" / \"file_group_a_0.data\",\n",
    "    directory / \"sub_directory\" / \"file_group_a_1.data\",\n",
    "    directory / \"sub_directory\" / \"file_group_b_2.data\",\n",
    "    directory / \"sub_directory\" / \"corresponding_file_0.data\",\n",
    "    directory / \"metadata.meta\",\n",
    "]\n",
    "for file_ in files:\n",
    "    file_.touch()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "list(directory.glob(\"**/*.*\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The Files class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "lc.Files?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Identify files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files = lc.Files.from_glob(\n",
    "    directory=directory,\n",
    "    pattern=\"**/*.*\"\n",
    ")\n",
    "files.df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For each file a Path object is stored:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files.df.applymap(lambda x: x.name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files.print_summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exclude files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files = lc.Files.from_glob(\n",
    "    directory=directory,\n",
    "    pattern=\"**/*.*\"\n",
    ")\n",
    "\n",
    "files.df.applymap(lambda x: x.name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "stoplist = lc.Files.concatenate([\n",
    "    lc.Files.from_glob(directory=files.directory, pattern=\"**/*.meta\"),\n",
    "    lc.Files.from_glob(directory=files.directory, pattern=\"**/*group_b*.*\")\n",
    "])\n",
    "stoplist.df.applymap(lambda x: x.name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files.exclude(stoplist=stoplist)\n",
    "files.df.applymap(lambda x: x.name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Match corresponding files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files = lc.Files.from_glob(\n",
    "    directory=directory,\n",
    "    pattern=\"**/*.*\",\n",
    "    regex=\"group_a_0\"\n",
    ")\n",
    "files.df.applymap(lambda x: x.name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "corresponding_files = lc.Files.from_glob(\n",
    "    directory=directory,\n",
    "    pattern=\"**/*.*\",\n",
    "    regex=\"corresponding\"\n",
    ")\n",
    "corresponding_files.df.applymap(lambda x: x.name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files.match_files(files=corresponding_files.df)\n",
    "files.df.applymap(lambda x: x.name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Match metadata files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files = lc.Files.from_glob(\n",
    "    directory=directory,\n",
    "    pattern=\"**/*.*\",\n",
    "    regex=\"group_a_0\"\n",
    ")\n",
    "files.df.applymap(lambda x: x.name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files.match_file_upstream(pattern=\"*.meta\")\n",
    "files.df.applymap(lambda x: x.name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Group files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files = lc.Files.from_glob(\n",
    "    directory=directory,\n",
    "    pattern=\"**/file*.data\"\n",
    ")\n",
    "files.df.applymap(lambda x: x.name)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files.set_group_identifier(name=\"A\", pattern=\"group_a\")\n",
    "files.df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files.set_group_identifier(name=\"B\", pattern=\"group_b\")\n",
    "files.df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files.group_identifiers()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "grouped = files.grouped()\n",
    "grouped.groups"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Indexing and iterating over files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files = lc.Files.from_glob(\n",
    "    directory=directory,\n",
    "    pattern=\"**/file*.data\"\n",
    ")\n",
    "files.df.applymap(lambda x: x.name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Slicing Files yield a new Files instance:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files[0:3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Indexing Files yields a Series with the selected row:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Iterating over Files yields a namedtuple  for each row: "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "for file in files:\n",
    "    print(file)\n",
    "    print(file.file_path)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}