Source code for fogdb.raw

# src/fogdb/raw/__init__.py
"""Module providing raw data parsing capabilities."""
import abc
import os
from collections import defaultdict


def to_dict(handler):
    """Read in raw data.

    Parameters
    ----------
    handler
        Handler class providing key interfacing capabilities. It must offer
        ``map_raw_data_file_tree()`` and ``map_source_file_data(path)``. See
        :mod:`fogdb.raw.lcl` or :mod:`fogdb.raw.smb` for examples.

    Returns
    -------
    dict
        Nested dicts holding the data keyed by category, keyed by
        subcategory, keyed by filename - ending, each mapping to whatever
        ``handler.map_source_file_data`` returns for that file, as in::

            returned_dict = {
                "crawford": {  # category
                    "common_fruiting_trees": {  # subcategory
                        "Cydonia_oblonga": {  # filename - ending
                            "common_names": "Quince",
                            "USDA_hardiness": 4,
                            # ...,
                        },
                    },
                    "less_common_fruiting_trees": {  # another subcategory
                        "Armelancher_canadensis": {  # filename - ending
                            "common_names": "Juneberry",
                            "USDA_hardiness": 4,
                            # ...,
                        },
                    },
                },
                "jacke": {  # another category
                    "plant_matrix": {  # source specific subcategory
                        "Cydonia_oblonga": {  # filename - ending
                            "common_names": "Quince",
                            "USDA_hardiness": 4,
                            # ...,
                        },
                    },
                },
            }

    Notes
    -----
    The tree returned by ``handler.map_raw_data_file_tree()`` is only read;
    the result is built as a new nested dict so the handler's tree (and its
    file lists) stay untouched.
    """
    # create a file tree of path
    tree = handler.map_raw_data_file_tree()

    # Build a fresh mapping instead of shallow-copying ``tree``: a shallow
    # copy shares the inner dicts, so filling it would overwrite the file
    # lists inside the handler's own tree.
    raw_data_map = {}
    for cat, subcats in tree.items():
        raw_data_map[cat] = {
            subcat: {
                # key is the file name up to its first dot; file reading is
                # handled by file specific parsers
                fle.split(".")[0]: handler.map_source_file_data(
                    os.path.join(cat, subcat, fle)
                )
                for fle in files
            }
            for subcat, files in subcats.items()
        }

    return raw_data_map
# pylint: disable=too-few-public-methods
# the BaseHandler provides only one public method, but a bunch of key privates
class BaseHandler:
    """Partly abstract Handler base class for mapping raw data.

    Subclasses supply the storage specific listing primitives
    (:meth:`_list_files` and :meth:`_list_top_level_folders`); this class
    combines them into a category / subcategory / file-name tree.

    Parameters
    ----------
    categories: str, list, default="all"
        String or list of strings specifying which categories (i.e. sublevel
        folders) are used for reading in the data. If ``"all"`` is used, all
        sublevel folders are traversed. Can be something like ``"crawford"``,
        ``"jacke"``, ``"myRand0mSUBf0lder"``, ...
        A single string selects exactly the folder of that name.
    dtype: str, default="txt"
        String specifying the data type of the raw datafiles. If ``"all"`` is
        used, data type is not filtered. Can be something like ``"rst"``,
        ``"cfg"``, ...
    excl_dirs: ~collections.abc.Container
        Container of strings specifying folder names to exclude during the
        mapping.
    """

    def __init__(self, categories="all", dtype="txt", excl_dirs=("FRITZ",)):
        self.categories = categories
        self.dtype = dtype
        # Not consulted by the methods defined in this class; presumably
        # applied by the subclasses' folder listing -- TODO confirm.
        self.exclude = excl_dirs

    def map_raw_data_file_tree(self):
        """Return mapped file tree of the expected :ref:`raw_data` structure.

        Returns
        -------
        dict
            ``{category: {subcategory: [file names]}}`` as reported by
            :meth:`_list_top_level_folders` and :meth:`_list_files`.
            Categories without any subcategory do not appear in the result.
        """
        cats = self._list_top_level_folders(relative_file_path=".")

        if self.categories != "all":
            # A bare string must be treated as ONE category name. Testing
            # ``cat in "crawford"`` directly would be a substring test and
            # would also select folders such as "craw" or "ford".
            if isinstance(self.categories, str):
                wanted = [self.categories]
            else:
                wanted = self.categories
            cats = [cat for cat in cats if cat in wanted]

        cat_subcat_map = {
            cat: self._list_top_level_folders(
                relative_file_path=os.path.join(".", cat)
            )
            for cat in cats
        }

        file_tree = defaultdict(dict)
        for cat, subcats in cat_subcat_map.items():
            for subcat in subcats:
                file_tree[cat][subcat] = list(
                    self._list_files(
                        relative_file_path=os.path.join(".", cat, subcat)
                    )
                )

        # hand out a plain dict so missing categories raise KeyError
        return dict(file_tree)

    # NOTE(review): ``abc.abstractmethod`` is only enforced for classes whose
    # metaclass is ``abc.ABCMeta``. This class does not use it, so the
    # decorators below document intent but do not block instantiation.
    @abc.abstractmethod
    def _list_files(self, relative_file_path):
        """Return sorted and filtered list of file names found in relative_file_path."""

    @abc.abstractmethod
    def _list_top_level_folders(self, relative_file_path):
        """Return alphabetically sorted list of folder names found in relative_file_path."""

    def _sort_and_filter_file_list(self, file_list):
        """Return ``file_list`` sorted, keeping only files of type ``self.dtype``.

        With ``self.dtype == "all"`` every file is kept. Otherwise a file is
        kept when the text after its last dot equals ``self.dtype``.
        """
        sorted_files = sorted(file_list)
        if self.dtype == "all":
            return sorted_files
        return [
            fle for fle in sorted_files if self._infer_file_type(fle) == self.dtype
        ]

    def _infer_file_type(self, file_string):
        """Return the text after the last dot of ``file_string``.

        If there is no dot, the whole string is returned.
        """
        return str(file_string).rsplit(".", maxsplit=1)[-1]
# pylint: enable=too-few-public-methods