Source code for locan.locan_io.files

"""
File manager

Identify, match, and group files to be batch-processed.
The class Files is a wrapper for a pandas.DataFrame with selected methods to
identify, match, and group file paths.
"""

from __future__ import annotations

import functools
import logging
import os
import re
import sys
from collections.abc import Iterable, Sequence
from pathlib import Path
from typing import Any

from pandas import DataFrame, Series

if sys.version_info >= (3, 11):
    from typing import Self
else:
    from typing_extensions import Self

import pandas as pd

from locan.locan_io.utilities import find_file_upstream

__all__: list[str] = ["Files"]

logger = logging.getLogger(__name__)



[docs]
class Files:
    """
    Wrapper for a pandas.DataFrame with selected methods to
    identify, match, and group file paths.

    Note
    ----
    Iteration and indexing are implemented such that integer indexing
    or iterating over the Files instance returns
    a single row (as Series or namedtuple).
    Slice indexing returns a new Files instance with the selected rows.

    Parameters
    ----------
    df : pd.DataFrame | dict[str, str] | None
        file names
    directory : str | os.PathLike[Any] | None
        base directory
    exists : bool
        raise `FileExistsError` if file in `df` does not exist
    column : str
        key/column in `df` from which to take a file list

    Attributes
    ----------
    df : pd.DataFrame
        dataframe carrying file paths
    directory: Path
        base directory
    """

    def __init__(
        self,
        df: pd.DataFrame | dict[str, str] | None = None,
        directory: str | os.PathLike[Any] | None = None,
        exists: bool = True,
        column: str = "file_path",
    ) -> None:
        if directory is None:
            self.directory = None
        elif isinstance(directory, (str, os.PathLike)):
            self.directory = Path(directory)
            if not self.directory.exists():
                raise ValueError(f"The directory {directory} does not exist.")
        else:
            raise TypeError("The given directory is not a valid type.")

        if df is None:
            self.df = pd.DataFrame(columns=[column])
        elif isinstance(df, (pd.DataFrame, pd.Series)):
            if hasattr(df, column):
                self.df = df
            else:
                raise AttributeError(f"Dataframe must have column {column}.")
        else:
            try:
                self.df = pd.DataFrame(data=df[column], columns=[column])
            except AttributeError as err:
                raise AttributeError(f"Dataframe must have column {column}.") from err

        if (
            not self.df.empty
            and not self.df[column].apply(lambda x: isinstance(x, Path)).all()
        ):
            self.df[column] = self.df[column].astype("string").map(Path)

        if exists:
            mask = ~self.df[column].apply(lambda x: x.exists())
            non_existing = self.df[column][mask]
            if any(mask):
                raise FileExistsError(
                    f"The following files do not exist: {non_existing}"
                )

    def __iter__(self) -> Iterable[tuple[Any, ...]]:
        return_value: Iterable[tuple[Any, ...]] = self.df.itertuples(name="Files")
        return return_value

    def __getitem__(self, index: Any) -> Series[Any] | DataFrame | Files:
        if isinstance(index, int):
            return self.df.iloc[index]
        else:
            return Files(directory=self.directory, df=self.df.iloc[index])


[docs]
    @classmethod
    def concatenate(
        cls,
        files: Iterable[Files] | None = None,
        directory: str | os.PathLike[Any] | None = None,
        exists: bool = True,
    ) -> Files:
        """
        Concatenate the file lists from multiple File instances
        and set the base directory without further action.

        Parameters
        ----------
        files
            sequence with File instances
        directory
            new base directory
        exists
            raise `FileExistsError` if file in files does not exist

        Returns
        -------
        Files
        """
        if files is None:
            return cls()
        else:
            df = pd.concat([files_.df for files_ in files], join="inner")
            df = df.drop_duplicates().reset_index(drop=True)
            column = df.columns[0]
            return cls(df=df, directory=directory, exists=exists, column=column)



[docs]
    @classmethod
    def from_path(
        cls,
        files: Sequence[str | os.PathLike[Any]] | str | os.PathLike[Any] | None = None,
        directory: str | os.PathLike[Any] | None = None,
        column: str = "file_path",
    ) -> Files:
        """
        Instantiate `Files` from a collection of file paths.

        Parameters
        ----------
        files
            sequence with File instances
        directory
            new base directory
        column
            Name of column in `Files.df` carrying these files

        Returns
        -------
        Files
        """
        df: pd.DataFrame | dict[str, str] | None
        if isinstance(files, (str, os.PathLike)):
            df = pd.DataFrame(data=[files], columns=[column])
        elif isinstance(files, Iterable):
            df = pd.DataFrame(data=files, columns=[column])
        else:
            df = files
        return cls(directory=directory, df=df, column=column)



[docs]
    @classmethod
    def from_glob(
        cls,
        directory: str | os.PathLike[Any] | None = None,
        pattern: str = "*.txt",
        regex: str | None = None,
        column: str = "file_path",
    ) -> Files:
        """
        Instantiate `Files` from a search with glob and/or regex patterns.

        Parameters
        ----------
        pattern
            glob pattern passed to :func:`Path.glob`
        regex
            regex pattern passed to :func:`re.search` and applied in addition
            to glob pattern
        directory
            new base directory in which to search
        column
            Name of column in `Files.df` carrying these files

        Returns
        -------
        Files
        """
        if directory is None:
            directory = Path().cwd()
        else:
            directory = Path(directory)

        files = directory.glob(pattern)

        if regex is None:
            files = list(files)  # type: ignore[assignment]
        else:
            regex_ = re.compile(regex)
            files = [file_ for file_ in files if regex_.search(str(file_)) is not None]  # type: ignore[assignment]

        df = pd.DataFrame(data=files, columns=[column])  # type: ignore[arg-type]
        return cls(directory=directory, df=df, column=column)



[docs]
    def add_glob(
        self,
        pattern: str | None = "*.txt",
        regex: str | None = None,
        column: str = "other_file_path",
    ) -> Self:
        """
        Search for file paths using glob and/or regex pattern in base directory
        and provide files in new `column`.

        A logging.warning is given if the number of found files and those
        in `self.df` are different.

        Parameters
        ----------
        pattern
            glob pattern passed to :func:`Path.glob`
        regex
            regex pattern passed to :func:`re.search` and applied in addition
            to glob pattern
        column : str
            Name of column in `Files.df` carrying these files

        Returns
        -------
        Self
        """
        files = self.directory.glob(pattern)  # type: ignore

        if regex is None:
            files = list(files)
        else:
            regex_ = re.compile(regex)
            files = [file_ for file_ in files if regex_.search(str(file_)) is not None]

        self.df[column] = pd.Series(files)
        if len(self.df) != self.df[column].count():
            logger.warning("Not all files are matched.")
        return self



[docs]
    def exclude(
        self,
        stoplist: Files | Iterable[bool | str | os.PathLike[Any]] | None = None,
        column: str = "file_path",
        column_stoplist: str = "file_path",
    ) -> Self:
        """
        Exclude files in `self.df.column` according to stoplist.

        Parameters
        ----------
        stoplist
            Files to be excluded
        column
            key/column in `df` from which to exclude files
        column_stoplist
            key/column in `stoplist` from which to take files

        Returns
        -------
        Self
        """
        if stoplist is None or bool(stoplist) is False:
            pass
        elif not isinstance(stoplist, Files) and all(
            isinstance(item, bool) for item in stoplist
        ):
            selection = [not item_ for item_ in stoplist]
            self.df = self.df[selection]
        else:
            if isinstance(stoplist, Files):
                stoplist = stoplist.df[column_stoplist].astype("string")
            else:
                try:
                    stoplist = [str(item_[column_stoplist]) for item_ in stoplist]  # type: ignore[index]
                except TypeError:
                    stoplist = [str(item_) for item_ in stoplist]

            if len(stoplist) != 0:  # type: ignore[arg-type]
                conditions = [
                    self.df[column].astype("string").str.contains(item_, regex=False)  # type: ignore[arg-type]
                    for item_ in stoplist  # type: ignore
                ]
                mask = functools.reduce(lambda x, y: x | y, conditions)
                self.df = self.df[~mask]
        return self



[docs]
    def match_files(
        self,
        files: pd.Series[Any],
        column: str = "file_path",
        other_column: str = "other_file_path",
    ) -> Self:
        """
        Add files in new column.

        A logging.warning is given if the number of files and those
        in `self.df` are different.

        Parameters
        ----------
        files
            New file list
        column
            Name of column in `Files.df` carrying files to match
        other_column
            Name of new column carrying files

        Returns
        -------
        Self
        """
        self.df[other_column] = files
        if self.df[column].count() != self.df[other_column].count():
            logger.warning("Not all files are matched.")
        return self



[docs]
    def match_file_upstream(
        self,
        column: str = "file_path",
        pattern: str | None = "*.toml",
        regex: str | None = None,
        directory: str | os.PathLike[Any] | None = None,
        other_column: str = "metadata",
    ) -> Self:
        """
        Find a matching file by applying :func:`locan.find_file_upstream` on
        each file in `self.df[column]`.

        Parameters
        ----------
        column
            Name of column in `Files.df` carrying files to match
        pattern
            glob pattern passed to :func:`Path.glob`
        regex
            regex pattern passed to :func:`re.search` and applied in addition
            to glob pattern
        directory
            top directory in which to search
        other_column
            Name of new column carrying files

        Returns
        -------
        Self
        """
        matched_file = [
            find_file_upstream(
                sub_directory=file_,
                pattern=pattern,
                top_directory=directory,
                regex=regex,
            )
            for file_ in self.df[column]
        ]
        self.df[other_column] = matched_file
        return self



[docs]
    def print_summary(self) -> None:
        """
        Print summary of Files.

        Returns
        -------
        None
        """
        print(f"Number of files: {len(self.df)}")
        print(f"Base directory: {self.directory}")
        print(f"Columns: {self.df.columns}")
        print(self.df.describe().loc[["count", "unique"]])  # type: ignore[index]



[docs]
    def group_identifiers(self) -> Any:  # todo: fix type
        """
        Get categories defined in self.df.group.

        Returns
        -------
        categories
        """
        return self.df.group.cat.categories



[docs]
    def grouped(self) -> pd.core.groupby.DataFrameGroupBy[Any]:  # type: ignore[type-arg]
        """
        Get groupby instance based on group_identifiers.

        Returns
        -------
        pandas.core.groupby.DataFrameGroupBy
        """
        return self.df.groupby(by="group", observed=True)



[docs]
    def set_group_identifier(
        self,
        name: str | None = None,
        pattern: str | None = None,
        glob: str | None = None,
        regex: str | None = None,
        column: str = "file_path",
    ) -> Self:
        """
        Set group_identifier `name` for files in `column` as identified by
        string pattern and/or glob pattern and/or regex
        and keep them in column "group".

        Parameters
        ----------
        name
            new group_identifier
        pattern
            string pattern
        glob
            glob pattern passed to :func:`Path.match`
        regex
            regex pattern
        column
            Name of column in `Files.df` carrying files to match

        Returns
        -------
        Self
        """
        if all(key_ is None for key_ in [pattern, glob, regex]):
            return self

        if pattern is not None:
            self.df["_mask_pattern"] = (
                self.df[column].astype("string").str.contains(pattern, regex=False)
            )

        if glob is not None:
            self.df["_mask_glob"] = self.df[column].apply(lambda x: x.match(glob))

        if regex is not None:
            self.df["_mask_regex"] = (
                self.df[column].astype("string").str.contains(regex, regex=True)
            )

        mask_columns = [
            key_
            for key_ in ["_mask_pattern", "_mask_glob", "_mask_regex"]
            if key_ in self.df.columns
        ]

        if len(mask_columns) == 1:
            self.df["mask"] = self.df[mask_columns]
        else:
            self.df["mask"] = self.df[mask_columns].all(axis=1)

        if "group" not in self.df.columns:
            self.df["group"] = pd.NA
            self.df["group"] = self.df["group"].astype("category")
        else:
            if self.df.loc[self.df["mask"], "group"].notna().any():
                logger.warning(f"Previously defined groups are overwritten with {name}")

        if name not in self.df["group"].cat.categories:
            self.df["group"] = self.df["group"].cat.add_categories([name])
        self.df.loc[self.df["mask"], "group"] = name
        self.df = self.df.drop(columns=mask_columns + ["mask"])

        if self.df.group.isna().any():
            logger.info("Some group identifiers are still NAN.")
        return self