# Source code for locan.locan_io.files

"""
File manager

Identify, match, and group files to be batch-processed.
The class Files is a wrapper for a pandas.DataFrame with selected methods to
identify, match, and group file paths.
"""

from __future__ import annotations

import functools
import logging
import os
import re
import sys
from collections.abc import Iterable, Sequence
from pathlib import Path
from typing import Any

from pandas import DataFrame, Series

if sys.version_info >= (3, 11):
    from typing import Self
else:
    from typing_extensions import Self

import pandas as pd

from locan.locan_io.utilities import find_file_upstream

__all__: list[str] = ["Files"]

logger = logging.getLogger(__name__)


class Files:
    """
    Wrapper for a pandas.DataFrame with selected methods to identify, match,
    and group file paths.

    Note
    ------
    Iteration and indexing is implemented in a way that integer indexing or
    iterating over the Files instance returns a single row (as Series or
    namedtuple). Slice indexing returns a new Files instance with selected
    rows.

    A dataframe passed as `df` is stored without copying; entries in `column`
    that are not `Path` instances are converted to `Path` in place.

    Parameters
    ----------
    df : pd.DataFrame | dict[str, str] | None
        file names
    directory : str | os.PathLike[Any] | None
        base directory
    exists : bool
        raise `FileExistsError` if file in `df` does not exist
    column : str
        key/column in `df` from which to take a file list

    Attributes
    ----------
    df : pd.DataFrame
        dataframe carrying file paths
    directory: Path | None
        base directory
    """

    def __init__(
        self,
        df: pd.DataFrame | dict[str, str] | None = None,
        directory: str | os.PathLike[Any] | None = None,
        exists: bool = True,
        column: str = "file_path",
    ) -> None:
        if directory is None:
            self.directory = None
        elif isinstance(directory, (str, os.PathLike)):
            self.directory = Path(directory)
            if not self.directory.exists():
                raise ValueError(f"The directory {directory} does not exist.")
        else:
            raise TypeError("The given directory is not a valid type.")

        if df is None:
            self.df = pd.DataFrame(columns=[column])
        elif isinstance(df, (pd.DataFrame, pd.Series)):
            # Membership test rather than hasattr(): hasattr() is True for any
            # DataFrame attribute (e.g. "shape") and False for existing
            # columns whose name is not a valid identifier (e.g. "my files").
            if column not in df:
                raise AttributeError(f"Dataframe must have column {column}.")
            self.df = df
        else:
            try:
                self.df = pd.DataFrame(data=df[column], columns=[column])
            except AttributeError as err:
                raise AttributeError(f"Dataframe must have column {column}.") from err

        # Normalize all entries to Path objects.
        if (
            not self.df.empty
            and not self.df[column].apply(lambda x: isinstance(x, Path)).all()
        ):
            self.df[column] = self.df[column].astype("string").map(Path)

        # Nothing to verify for an empty file list.
        if exists and not self.df.empty:
            missing = ~self.df[column].apply(lambda x: x.exists())
            if missing.any():
                non_existing = self.df[column][missing]
                raise FileExistsError(
                    f"The following files do not exist: {non_existing}"
                )

    def __iter__(self) -> Iterable[tuple[Any, ...]]:
        """Iterate over rows of `self.df` as namedtuples called "Files"."""
        return_value: Iterable[tuple[Any, ...]] = self.df.itertuples(name="Files")
        return return_value

    def __getitem__(self, index: Any) -> Series[Any] | DataFrame | Files:
        """
        Return a single row (as Series) for an integer index or a new Files
        instance carrying the selected rows for any other index.
        """
        if isinstance(index, int):
            return self.df.iloc[index]
        # The file column name is not stored on the instance; prefer the
        # default name and fall back on the first column (as `concatenate`
        # does) so that instances with a custom column can be sliced.
        column = "file_path" if "file_path" in self.df.columns else self.df.columns[0]
        # Selecting rows must not hit the file system again: the files were
        # either verified on construction or deliberately left unverified.
        return Files(
            directory=self.directory,
            df=self.df.iloc[index],
            exists=False,
            column=column,
        )

    @classmethod
    def concatenate(
        cls,
        files: Iterable[Files] | None = None,
        directory: str | os.PathLike[Any] | None = None,
        exists: bool = True,
    ) -> Files:
        """
        Concatenate the file lists from multiple Files instances and set the
        base directory without further action.

        Only columns shared by all instances are kept, duplicate rows are
        dropped and the index is reset. The first remaining column is taken
        as the file column.

        Parameters
        ----------
        files
            sequence with Files instances
        directory
            new base directory
        exists
            raise `FileExistsError` if file in files does not exist

        Returns
        -------
        Files
        """
        frames = [] if files is None else [files_.df for files_ in files]
        if not frames:
            # pd.concat cannot deal with an empty sequence; return an empty
            # instance that still carries the requested base directory.
            return cls(directory=directory)

        df = pd.concat(frames, join="inner")
        df = df.drop_duplicates().reset_index(drop=True)
        column = df.columns[0]
        return cls(df=df, directory=directory, exists=exists, column=column)

    @classmethod
    def from_path(
        cls,
        files: Sequence[str | os.PathLike[Any]] | str | os.PathLike[Any] | None = None,
        directory: str | os.PathLike[Any] | None = None,
        column: str = "file_path",
        exists: bool = True,
    ) -> Files:
        """
        Instantiate `Files` from a collection of file paths.

        Parameters
        ----------
        files
            a single file path or a collection of file paths
        directory
            new base directory
        column
            Name of column in `Files.df` carrying these files
        exists
            raise `FileExistsError` if file in files does not exist

        Returns
        -------
        Files
        """
        df: pd.DataFrame | dict[str, str] | None
        if isinstance(files, (str, os.PathLike)):
            # A single path is one row, not an iterable of characters.
            df = pd.DataFrame(data=[files], columns=[column])
        elif isinstance(files, Iterable):
            df = pd.DataFrame(data=files, columns=[column])
        else:
            df = files
        return cls(directory=directory, df=df, exists=exists, column=column)

    @classmethod
    def from_glob(
        cls,
        directory: str | os.PathLike[Any] | None = None,
        pattern: str = "*.txt",
        regex: str | None = None,
        column: str = "file_path",
    ) -> Files:
        """
        Instantiate `Files` from a search with glob and/or regex patterns.

        Parameters
        ----------
        pattern
            glob pattern passed to :func:`Path.glob`
        regex
            regex pattern passed to :func:`re.search` and applied in addition
            to glob pattern
        directory
            new base directory in which to search; the current working
            directory is used if None
        column
            Name of column in `Files.df` carrying these files

        Returns
        -------
        Files
        """
        directory = Path.cwd() if directory is None else Path(directory)

        found = directory.glob(pattern)
        if regex is None:
            selected = list(found)
        else:
            regex_ = re.compile(regex)
            selected = [file_ for file_ in found if regex_.search(str(file_)) is not None]

        df = pd.DataFrame(data=selected, columns=[column])
        return cls(directory=directory, df=df, column=column)

    def add_glob(
        self,
        pattern: str | None = "*.txt",
        regex: str | None = None,
        column: str = "other_file_path",
    ) -> Self:
        """
        Search for file paths using glob and/or regex pattern in base
        directory and provide files in new `column`.

        A logging.warning is given if the number of found files and those in
        `self.df` are different.

        Parameters
        ----------
        pattern
            glob pattern passed to :func:`Path.glob`
        regex
            regex pattern passed to :func:`re.search` and applied in addition
            to glob pattern
        column : str
            Name of column in `Files.df` carrying these files

        Returns
        -------
        Self
        """
        files = self.directory.glob(pattern)  # type: ignore
        if regex is None:
            files = list(files)
        else:
            regex_ = re.compile(regex)
            files = [file_ for file_ in files if regex_.search(str(file_)) is not None]

        self.df[column] = pd.Series(files)
        # Warn for a mismatch in either direction: surplus files are dropped
        # by the assignment above and would otherwise go unnoticed, missing
        # files show up as NaN entries.
        if len(files) != len(self.df) or len(self.df) != self.df[column].count():
            logger.warning("Not all files are matched.")
        return self

    def exclude(
        self,
        stoplist: Files | Iterable[bool | str | os.PathLike[Any]] | None = None,
        column: str = "file_path",
        column_stoplist: str = "file_path",
    ) -> Self:
        """
        Exclude files in `self.df.column` according to stoplist.

        If `stoplist` holds only booleans it is used as a mask in which True
        marks a row to be removed. Otherwise every row is removed for which
        the file path contains one of the stoplist entries as substring.

        Parameters
        ----------
        stoplist
            Files to be excluded
        column
            key/column in `df` from which to exclude files
        column_stoplist
            key/column in `stoplist` from which to take files

        Returns
        -------
        Self
        """
        if isinstance(stoplist, (str, os.PathLike)):
            # A single path is one entry, not an iterable of characters.
            stoplist = [stoplist] if stoplist else []
        elif not isinstance(stoplist, Files) and isinstance(stoplist, Iterable):
            # Materialize once: one-shot iterables would be exhausted by the
            # checks below, and array-likes have no unambiguous truth value.
            stoplist = list(stoplist)

        if stoplist is None or not stoplist:
            return self

        patterns: Any
        if isinstance(stoplist, Files):
            patterns = stoplist.df[column_stoplist].astype("string")
        elif all(isinstance(item_, bool) for item_ in stoplist):
            self.df = self.df[[not item_ for item_ in stoplist]]
            return self
        else:
            try:
                # Entries that can be indexed by `column_stoplist`.
                patterns = [str(item_[column_stoplist]) for item_ in stoplist]  # type: ignore[index]
            except TypeError:
                patterns = [str(item_) for item_ in stoplist]

        if len(patterns) != 0:
            path_strings = self.df[column].astype("string")
            conditions = [
                path_strings.str.contains(item_, regex=False) for item_ in patterns
            ]
            mask = functools.reduce(lambda x, y: x | y, conditions)
            self.df = self.df[~mask]
        return self

    def match_files(
        self,
        files: pd.Series[Any],
        column: str = "file_path",
        other_column: str = "other_file_path",
    ) -> Self:
        """
        Add files in new column.

        A logging.warning is given if the number of files and those in
        `self.df` are different.

        Parameters
        ----------
        files
            New file list
        column
            Name of column in `Files.df` carrying files to match
        other_column
            Name of new column carrying files

        Returns
        -------
        Self
        """
        self.df[other_column] = files
        if self.df[column].count() != self.df[other_column].count():
            logger.warning("Not all files are matched.")
        return self

    def match_file_upstream(
        self,
        column: str = "file_path",
        pattern: str | None = "*.toml",
        regex: str | None = None,
        directory: str | os.PathLike[Any] | None = None,
        other_column: str = "metadata",
    ) -> Self:
        """
        Find a matching file by applying :func:`locan.find_file_upstream` on
        each file in `self.df[column]`.

        Parameters
        ----------
        column
            Name of column in `Files.df` carrying files to match
        pattern
            glob pattern passed to :func:`Path.glob`
        regex
            regex pattern passed to :func:`re.search` and applied in addition
            to glob pattern
        directory
            top directory in which to search
        other_column
            Name of new column carrying files

        Returns
        -------
        Self
        """
        matched_file = [
            find_file_upstream(
                sub_directory=file_,
                pattern=pattern,
                top_directory=directory,
                regex=regex,
            )
            for file_ in self.df[column]
        ]
        self.df[other_column] = matched_file
        return self

    def print_summary(self) -> None:
        """
        Print summary of Files.

        Returns
        -------
        None
        """
        print(f"Number of files: {len(self.df)}")
        print(f"Base directory: {self.directory}")
        print(f"Columns: {self.df.columns}")
        print(self.df.describe().loc[["count", "unique"]])  # type: ignore[index]

    def group_identifiers(self) -> Any:  # todo: fix type
        """
        Get categories defined in self.df.group.

        Returns
        -------
        categories
        """
        return self.df.group.cat.categories

    def grouped(self) -> pd.core.groupby.DataFrameGroupBy[Any]:  # type: ignore[type-arg]
        """
        Get groupby instance based on group_identifiers.

        Returns
        -------
        pandas.core.groupby.DataFrameGroupBy
        """
        return self.df.groupby(by="group", observed=True)

    def set_group_identifier(
        self,
        name: str | None = None,
        pattern: str | None = None,
        glob: str | None = None,
        regex: str | None = None,
        column: str = "file_path",
    ) -> Self:
        """
        Set group_identifier `name` for files in `column` as identified by
        string pattern and/or glob pattern and/or regex and keep them in
        column "group".

        If more than one of `pattern`, `glob`, `regex` is given, a file must
        satisfy all of them. Nothing is changed if none is given.

        Parameters
        ----------
        name
            new group_identifier
        pattern
            string pattern
        glob
            glob pattern passed to :func:`Path.match`
        regex
            regex pattern
        column
            Name of column in `Files.df` carrying files to match

        Returns
        -------
        Self
        """
        if pattern is None and glob is None and regex is None:
            return self

        # Masks are kept as local Series instead of temporary dataframe
        # columns, so no helper column can leak into (or overwrite a column
        # of) the dataframe.
        paths = self.df[column]
        path_strings = paths.astype("string")
        masks = []
        if pattern is not None:
            masks.append(path_strings.str.contains(pattern, regex=False))
        if glob is not None:
            masks.append(paths.apply(lambda x: x.match(glob)))
        if regex is not None:
            masks.append(path_strings.str.contains(regex, regex=True))
        mask = functools.reduce(lambda x, y: x & y, masks).astype(bool)

        if "group" not in self.df.columns:
            self.df["group"] = pd.NA
            self.df["group"] = self.df["group"].astype("category")
        elif self.df.loc[mask, "group"].notna().any():
            logger.warning("Previously defined groups are overwritten with %s", name)

        if name not in self.df["group"].cat.categories:
            self.df["group"] = self.df["group"].cat.add_categories([name])

        self.df.loc[mask, "group"] = name

        if self.df["group"].isna().any():
            logger.info("Some group identifiers are still NAN.")
        return self