{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Tutorial about managing files in batch processing"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "When analysing a set of experiments you want to collect, match and group files according to information content and experimental conditions.\n",
    "\n",
    "The Files class will help you."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "from pathlib import Path\n",
    "import tempfile\n",
    "\n",
    "%matplotlib inline\n",
    "\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "import locan as lc"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "lc.show_versions(system=False, dependencies=False, verbose=False)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Most tables in this tutorial hold `Path` objects. The following helper displays only the file names:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "def show_file_names(frame):\n",
    "    \"\"\"\n",
    "    Return a new DataFrame with every element replaced by its `name` attribute.\n",
    "\n",
    "    Parameters\n",
    "    ----------\n",
    "    frame : pandas.DataFrame\n",
    "        Table whose elements all provide a `name` attribute (e.g. Path objects).\n",
    "\n",
    "    Returns\n",
    "    -------\n",
    "    pandas.DataFrame\n",
    "    \"\"\"\n",
    "    # DataFrame.applymap is deprecated since pandas 2.1 in favour of\n",
    "    # DataFrame.map; keep a fallback for older pandas versions.\n",
    "    elementwise = frame.map if hasattr(frame, \"map\") else frame.applymap\n",
    "    return elementwise(lambda path: path.name)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Some file structure to be analysed"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Use TemporaryDirectory (instead of mkdtemp) so that the directory\n",
    "# can be removed again at the end of the tutorial.\n",
    "temp_directory = tempfile.TemporaryDirectory()\n",
    "directory = Path(temp_directory.name)\n",
    "subdirectory = directory.joinpath(\"sub_directory\")\n",
    "subdirectory.mkdir()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Empty placeholder files; only their names and locations matter here.\n",
    "file_paths = [\n",
    "    directory / \"sub_directory\" / \"file_group_a_0.data\",\n",
    "    directory / \"sub_directory\" / \"file_group_a_1.data\",\n",
    "    directory / \"sub_directory\" / \"file_group_b_2.data\",\n",
    "    directory / \"sub_directory\" / \"corresponding_file_0.data\",\n",
    "    directory / \"metadata.meta\",\n",
    "]\n",
    "for file_path in file_paths:\n",
    "    file_path.touch()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "list(directory.glob(\"**/*.*\"))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## The Files class"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "lc.Files?"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Identify files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files = lc.Files.from_glob(\n",
    "    directory=directory,\n",
    "    pattern=\"**/*.*\"\n",
    ")\n",
    "files.df"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "For each file a Path object is stored:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "show_file_names(files.df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files.print_summary()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Exclude files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files = lc.Files.from_glob(\n",
    "    directory=directory,\n",
    "    pattern=\"**/*.*\"\n",
    ")\n",
    "\n",
    "show_file_names(files.df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "stoplist = lc.Files.concatenate([\n",
    "    lc.Files.from_glob(directory=files.directory, pattern=\"**/*.meta\"),\n",
    "    lc.Files.from_glob(directory=files.directory, pattern=\"**/*group_b*.*\")\n",
    "])\n",
    "show_file_names(stoplist.df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files.exclude(stoplist=stoplist)\n",
    "show_file_names(files.df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Match corresponding files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files = lc.Files.from_glob(\n",
    "    directory=directory,\n",
    "    pattern=\"**/*.*\",\n",
    "    regex=\"group_a_0\"\n",
    ")\n",
    "show_file_names(files.df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "corresponding_files = lc.Files.from_glob(\n",
    "    directory=directory,\n",
    "    pattern=\"**/*.*\",\n",
    "    regex=\"corresponding\"\n",
    ")\n",
    "show_file_names(corresponding_files.df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files.match_files(files=corresponding_files.df)\n",
    "show_file_names(files.df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Match metadata files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files = lc.Files.from_glob(\n",
    "    directory=directory,\n",
    "    pattern=\"**/*.*\",\n",
    "    regex=\"group_a_0\"\n",
    ")\n",
    "show_file_names(files.df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files.match_file_upstream(pattern=\"*.meta\")\n",
    "show_file_names(files.df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Group files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files = lc.Files.from_glob(\n",
    "    directory=directory,\n",
    "    pattern=\"**/file*.data\"\n",
    ")\n",
    "show_file_names(files.df)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files.set_group_identifier(name=\"A\", pattern=\"group_a\")\n",
    "files.df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files.set_group_identifier(name=\"B\", pattern=\"group_b\")\n",
    "files.df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files.group_identifiers()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "grouped = files.grouped()\n",
    "grouped.groups"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Indexing and iterating over files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files = lc.Files.from_glob(\n",
    "    directory=directory,\n",
    "    pattern=\"**/file*.data\"\n",
    ")\n",
    "show_file_names(files.df)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Slicing Files yields a new Files instance:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files[0:3]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Indexing Files yields a Series with the selected row:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "files[0]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Iterating over Files yields a namedtuple for each row:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "for file in files:\n",
    "    print(file)\n",
    "    print(file.file_path)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Clean up\n",
    "\n",
    "Remove the temporary directory and all files created for this tutorial."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "temp_directory.cleanup()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.9"
  },
  "widgets": {
   "application/vnd.jupyter.widget-state+json": {
    "state": {},
    "version_major": 2,
    "version_minor": 0
   }
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}