Return to Answer

There are no longer any errors in the code. I have also added some small doctests to show how the more complex functions work.

Source Link

edited Jan 23, 2020 at 15:53

Peilonrayz ♦

44.6k
7
80
158

import os
import pprint
import timeenum
import logging
import os
import pathlib
import pprint
import time

from typing import Any, Callable, ClassVar, Dict, Generic, Iterator, List, Optional, Set, Tuple, TypeVar

TItem = TypeVar('TItem')
TFlag = TypeVar('TFlag')


MAX_AGE = 90

# 60 seconds in a minute, 60 minutes in an hour, 24 hours in a day
MINUTES = 60
HOURS = 60*MINUTES
DAYS = 24*HOURS

PATH_TO_FILES = pathlib.Path("path/to/local/copy/of/lib/"./lib")
EXT = ".pdf"
WHITELISTLANGUAGES = {'_nl', '_en', '_de'}
SUPPLIERS = ['Siemens', 'IFM']


class BaseClassifier(Generic[TItem, TFlag]):
    """
    Classifier metaclass to ease classifying values.

    This is a fairly magical class that allows setting the required state.
    It also allows validating against all validators starting with `_valid_`,
    without having to manually call each of them.
    """
    DEFAULT =: 0ClassVar[TFlag]

    def __init__(self, **kwargs: Any):
        """
        Initialize classifier state.

        Values must be passed as keywords so they can be set correctly
        on the instance. For each keyword provided an attribute with the
        same name is created and has the value set to the keywords value.

        For example we can set the attribute `foo` by passing it as a keyword.

            >>> bc = BaseClassifier(foo='bar')
            >>> bc.foo
            'bar'
        """
        for name, value in kwargs.items():
            setattr(self, name, value)

    def _get_validators(self) -> List[Callable[[TItem], Optional[TFlag]]]:
        """Get"""
        Get all validators on the instance.

        All validators must start `_valid_` and so we loop through all
        the names of the attributes on the instance, via `dir`, and
        filter to ones that start with `_valid_`. For each name that
        starts with `_valid_` we return the value of the attribute.

        For example `Classifier` has the following validators.
        Note that the test is for the function names, but it actually
        returns the full fat function. Just getting the test to work
        with the full fat functions is a PITA.

            >>> validators = Classifier()._get_validators()
            >>> [validator.__name__ for validator in validators]
            ['_valid_age', '_valid_extension', '_valid_language']
        """
        return [
            getattr(self, name)
            for name in dir(self)
            if name.startswith('_valid_')
        ]

    def validate(self, items: Iterator[TItem]) -> Iterator[Tuple[TFlag, TItem]]:
        """Validate"""
        Validate input against the instance's validators.
        
        For each provided item we run all validators to build a complete 
        picture of any problems with the item. Once all validators have
        ran we output the item as is, but with any statuses that the
        validators have assigned to it.
        
        For example the test below shows that 123_it.txt has both an
        invalid language and file type as it's flag is FileState.INVALID.

            >>> classifier = Classifier(\
                ext=EXT,\
                languages=LANGUAGES,\
                time_limit=time.time() - (MAX_AGE * DAYS),\
            )
            >>> list(classifier.validate([pathlib.Path('./lib/IFM/123_it.txt')]))
            [(<FileState.INVALID: 3>, WindowsPath('lib/IFM/123_it.txt'))]
        """
        functions = self._get_validators()
        for item in items:
            ret = self.DEFAULT
            for function in functions:
                output = function(item)
                if output is not None:
                    ret |= output
            yield ret, item


class FileState(enum.IntFlag):
    """States that files can be in."""
    CORRECT = 0
    EXTENSION = enum.auto()
    LANGUAGE = enum.auto()
    OUT_OF_DATE = enum.auto()
    
    # A convenience for your old categories.
    # only used in the backward-compatible `classify_paths`
    INVALID = EXTENSION | LANGUAGE


class Classifier(BaseClassifierBaseClassifier[pathlib.Path, FileState]):
    """Classifiers for files."""
    __slots__ = ('ext', 'languages', 'time_limit')
    DEFAULT: ClassVar[FileState] = FileState.CORRECT
    
    ext: str
    languages: Set[str]
    time_limit: int

    def _valid_extension(self, path: pathlib.Path) -> Optional[FileState]:
        """Validate if path has correct extension."""
        if path.suffix != self.ext:
            return FileState.EXTENSION

    def _valid_language(self, path: pathlib.Path) -> Optional[FileState]:
        """Validate if path has correct language."""
        if path.stem[-3:] not in self.languages:
            return FileState.LANGUAGE

    def _valid_age(self, path: pathlib.Path) -> Optional[FileState]:
        """Validate if file is out of date."""
        if path.stat().st_mtime < self.time_limit:
            return FileState.OUT_OF_DATE


# Simpler method of grouping categories
# This utilizes the `enum` we defined earlier.
def classify_paths_simple(
    classifier: BaseClassifier,
    paths: Iterator[pathlib.Path],
) -> Tuple[Dict[FileState, List[pathlib.Path]], int]:
    """Group classifications."""
    report: Dict[FileState, List[pathlib.Path]] = {}
    for count, (state, path) in enumerate(classifier.validate(paths)):
        report.setdefault(state, []).append(path)
    return report, count


# Complex as I make the output exactly the same as the question.
def classify_paths(
    classifier: BaseClassifier,
    paths: Iterator[pathlib.Path],
) -> Tuple[Dict[str, List[pathlib.Path]], int]:
    """Group classifications into generic custom types."""
    report: Dict[str, List[pathlib.Path]] = {
        'Correct': [],
        'Invalid': [],
        'OutOfDate': [],
    }
    for count, (state, path) in enumerate(classifier.validate(paths)):
        if state == FileState.CORRECT:
            key = 'Correct'
        elif state & FileState.INVALID:
            key = 'Invalid'
        else:
            key = 'OutOfDate'
        report[key].append(path)
    return report, count


def main():
    classifier = Classifier(
        ext=EXT,
        languages=LANGUAGES,
        time_limit=time.time() - limit(MAX_AGE * DAYS),
    )
    for supplier in SUPPLIERS:
        logging.debug(supplier)
        # Also use classify_paths_simple to contrast the outputs
        result, count = classify_paths(
            classifier,
            (PATH_TO_FILES / supplier).iterdir()
        )
        pprint.pprint(result)
        logging.info("%s files checked", count)

if __name__ == '__main__':
    main()

    import doctest
    doctest.testmod()

import os
import pprint
import time
import logging
import pathlib

from typing import Set


MAX_AGE = 90

# 60 seconds in a minute, 60 minutes in an hour, 24 hours in a day
MINUTES = 60
HOURS = 60*MINUTES
DAYS = 24*HOURS

PATH_TO_FILES = pathlib.Path("path/to/local/copy/of/lib/")
EXT = ".pdf"
WHITELIST = {'_nl', '_en', '_de'}
SUPPLIERS = ['Siemens', 'IFM']


class BaseClassifier:
    """
    Classifier metaclass to ease classifying values.

    This is a fairly magical class that allows setting the required state.
    It also allows validating against all validators starting with `_valid_`,
    without having to manually call each of them.
    """
    DEFAULT = 0

    def __init__(self, **kwargs):
        """
        Initialize classifier state.

        Values must be passed as keywords so they can be set correctly on the instance.
        """
        for name, value in kwargs.items():
            setattr(self, name, value)

    def _get_validators(self):
        """Get all validators on the instance."""
        return [
            getattr(self, name)
            for name in dir(self)
            if name.startswith('_valid_')
        ]

    def validate(self, items):
        """Validate input against the instance's validators."""
        functions = self._get_validators()
        for item in items:
            ret = self.DEFAULT
            for function in functions:
                output = function(item)
                if output is not None:
                    ret |= output
            yield ret, item


class FileState(enum.IntFlag):
    """States that files can be in."""
    CORRECT = 0
    EXTENSION = enum.auto()
    LANGUAGE = enum.auto()
    OUT_OF_DATE = enum.auto()
    
    # A convenience for your old categories.
    # only used in the backward-compatible `classify_paths`
    INVALID = EXTENSION | LANGUAGE


class Classifier(BaseClassifier):
    """Classifiers for files."""
    __slots__ = ('ext', 'languages', 'time_limit')
    DEFAULT = FileState.CORRECT
    
    ext: str
    languages: Set[str]
    time_limit: int

    def _valid_extension(self, path):
        """Validate if path has correct extension."""
        if path.suffix != self.ext:
            return FileState.EXTENSION

    def _valid_language(self, path):
        """Validate if path has correct language."""
        if path.stem[-3:] not in self.languages:
            return FileState.LANGUAGE

    def _valid_age(self, path):
        """Validate if file is out of date."""
        if path.stat().st_mtime < self.time_limit:
            return FileState.OUT_OF_DATE


# Simpler method of grouping categories
# This utilizes the `enum` we defined earlier.
def classify_paths_simple(classifier, paths):
    """Group classifications."""
    report = {}
    for count, (state, path) in enumerate(classifier.validate(paths)):
        report.setdefault(state, []).append(path)
    return report, count


# Complex as I make the output exactly the same as the question.
def classify_paths(classifier, paths):
    """Group classifications into generic custom types."""
    report = {
        'Correct': [],
        'Invalid': [],
        'OutOfDate': [],
    }
    for count, (state, path) in enumerate(classifier.validate(paths)):
        if state == FileState.CORRECT:
            key = 'Correct'
        elif state & FileState.INVALID:
            key = 'Invalid'
        else:
            key = 'OutOfDate'
        report[key].append(path)
    return report, count


def main():
    classifier = Classifier(
        ext=EXT,
        languages=LANGUAGES,
        time_limit=time.time() - limit,
    )
    for supplier in SUPPLIERS:
        logging.debug(supplier)
        result, count = classify_paths(
            classifier,
            (PATH_TO_FILES / supplier).iterdir()
        )
        pprint.pprint(result)
        logging.info("%s files checked", count)

if __name__ == '__main__':
    main()

import enum
import logging
import os
import pathlib
import pprint
import time

from typing import Any, Callable, ClassVar, Dict, Generic, Iterator, List, Optional, Set, Tuple, TypeVar

TItem = TypeVar('TItem')
TFlag = TypeVar('TFlag')


MAX_AGE = 90

# 60 seconds in a minute, 60 minutes in an hour, 24 hours in a day
MINUTES = 60
HOURS = 60*MINUTES
DAYS = 24*HOURS

PATH_TO_FILES = pathlib.Path("./lib")
EXT = ".pdf"
LANGUAGES = {'_nl', '_en', '_de'}
SUPPLIERS = ['Siemens', 'IFM']


class BaseClassifier(Generic[TItem, TFlag]):
    """
    Classifier metaclass to ease classifying values.

    This is a fairly magical class that allows setting the required state.
    It also allows validating against all validators starting with `_valid_`,
    without having to manually call each of them.
    """
    DEFAULT: ClassVar[TFlag]

    def __init__(self, **kwargs: Any):
        """
        Initialize classifier state.

        Values must be passed as keywords so they can be set correctly
        on the instance. For each keyword provided an attribute with the
        same name is created and has the value set to the keywords value.

        For example we can set the attribute `foo` by passing it as a keyword.

            >>> bc = BaseClassifier(foo='bar')
            >>> bc.foo
            'bar'
        """
        for name, value in kwargs.items():
            setattr(self, name, value)

    def _get_validators(self) -> List[Callable[[TItem], Optional[TFlag]]]:
        """
        Get all validators on the instance.

        All validators must start `_valid_` and so we loop through all
        the names of the attributes on the instance, via `dir`, and
        filter to ones that start with `_valid_`. For each name that
        starts with `_valid_` we return the value of the attribute.

        For example `Classifier` has the following validators.
        Note that the test is for the function names, but it actually
        returns the full fat function. Just getting the test to work
        with the full fat functions is a PITA.

            >>> validators = Classifier()._get_validators()
            >>> [validator.__name__ for validator in validators]
            ['_valid_age', '_valid_extension', '_valid_language']
        """
        return [
            getattr(self, name)
            for name in dir(self)
            if name.startswith('_valid_')
        ]

    def validate(self, items: Iterator[TItem]) -> Iterator[Tuple[TFlag, TItem]]:
        """
        Validate input against the instance's validators.
        
        For each provided item we run all validators to build a complete 
        picture of any problems with the item. Once all validators have
        ran we output the item as is, but with any statuses that the
        validators have assigned to it.
        
        For example the test below shows that 123_it.txt has both an
        invalid language and file type as it's flag is FileState.INVALID.

            >>> classifier = Classifier(\
                ext=EXT,\
                languages=LANGUAGES,\
                time_limit=time.time() - (MAX_AGE * DAYS),\
            )
            >>> list(classifier.validate([pathlib.Path('./lib/IFM/123_it.txt')]))
            [(<FileState.INVALID: 3>, WindowsPath('lib/IFM/123_it.txt'))]
        """
        functions = self._get_validators()
        for item in items:
            ret = self.DEFAULT
            for function in functions:
                output = function(item)
                if output is not None:
                    ret |= output
            yield ret, item


class FileState(enum.IntFlag):
    """States that files can be in."""
    CORRECT = 0
    EXTENSION = enum.auto()
    LANGUAGE = enum.auto()
    OUT_OF_DATE = enum.auto()
    
    # A convenience for your old categories.
    # only used in the backward-compatible `classify_paths`
    INVALID = EXTENSION | LANGUAGE


class Classifier(BaseClassifier[pathlib.Path, FileState]):
    """Classifiers for files."""
    __slots__ = ('ext', 'languages', 'time_limit')
    DEFAULT: ClassVar[FileState] = FileState.CORRECT
    
    ext: str
    languages: Set[str]
    time_limit: int

    def _valid_extension(self, path: pathlib.Path) -> Optional[FileState]:
        """Validate if path has correct extension."""
        if path.suffix != self.ext:
            return FileState.EXTENSION

    def _valid_language(self, path: pathlib.Path) -> Optional[FileState]:
        """Validate if path has correct language."""
        if path.stem[-3:] not in self.languages:
            return FileState.LANGUAGE

    def _valid_age(self, path: pathlib.Path) -> Optional[FileState]:
        """Validate if file is out of date."""
        if path.stat().st_mtime < self.time_limit:
            return FileState.OUT_OF_DATE


# Simpler method of grouping categories
# This utilizes the `enum` we defined earlier.
def classify_paths_simple(
    classifier: BaseClassifier,
    paths: Iterator[pathlib.Path],
) -> Tuple[Dict[FileState, List[pathlib.Path]], int]:
    """Group classifications."""
    report: Dict[FileState, List[pathlib.Path]] = {}
    for count, (state, path) in enumerate(classifier.validate(paths)):
        report.setdefault(state, []).append(path)
    return report, count


# Complex as I make the output exactly the same as the question.
def classify_paths(
    classifier: BaseClassifier,
    paths: Iterator[pathlib.Path],
) -> Tuple[Dict[str, List[pathlib.Path]], int]:
    """Group classifications into generic custom types."""
    report: Dict[str, List[pathlib.Path]] = {
        'Correct': [],
        'Invalid': [],
        'OutOfDate': [],
    }
    for count, (state, path) in enumerate(classifier.validate(paths)):
        if state == FileState.CORRECT:
            key = 'Correct'
        elif state & FileState.INVALID:
            key = 'Invalid'
        else:
            key = 'OutOfDate'
        report[key].append(path)
    return report, count


def main():
    classifier = Classifier(
        ext=EXT,
        languages=LANGUAGES,
        time_limit=time.time() - (MAX_AGE * DAYS),
    )
    for supplier in SUPPLIERS:
        logging.debug(supplier)
        # Also use classify_paths_simple to contrast the outputs
        result, count = classify_paths(
            classifier,
            (PATH_TO_FILES / supplier).iterdir()
        )
        pprint.pprint(result)
        logging.info("%s files checked", count)

if __name__ == '__main__':
    main()

    import doctest
    doctest.testmod()

Source Link

answered Jan 21, 2020 at 13:35

Peilonrayz ♦

44.6k
7
80
158

Your code is small, but there's a lot going on that I find it hard to understand. Given that this is just the beginning, the complexity of the code will just increase as time goes on.
I found the large amount of globals to hinder readability, as each time I came across another I had to scroll to the top of the code to find out what it is.

Whilst I prefer not to have globals and hide them away in classes. I think there are two options when using a class:
- Make a Classifier class that just has those globals defined on the class as class variables.
- Make a Classifier class that is provided the values at instantiation. And utilize typing to convey what the values contain.
Whilst I find a class helps reduce complexity. It should also increase general maintainability and readability. Currently your function is a bug/feature hotspot, and when you add more classifiers to the function the complexity will only increase.

Splitting the different validation checks into their own methods, or functions, helps break down the function and make the code small palatable chunks. And so increases readability.

It also increases maintainability as when you need to add additional functionality to the function, you instead have a small self-contained function. Which is simpler to re-learn, when coming back to the function after some time. In the changes I made the validation checks each became a method containing a single if and return.

I'm currently not automatically iterating over all suppliers because for some different standards apply.

This also increases customization, as you can easily define multiple classes with the different validation checks. Utilizing either inheritance or mixins to simplify the code.
Since the Classifier class will have lots of functions that will be called in the same way, I would opt to make a BaseClassifier class that magically handles everything.
- Since I would pass values in at instantiation I would define the __init__ and simply assign the value to the instance.
- Since we'll have lots of functions, to reduce the amount of WET I would automagically find the functions to run using dir and getattr.
- Since the functions will all be called in the same way, defining a simple validate function can reduce the amount of code to write on each classifier.
I personally am not a fan of magic strings for your categories.

Rather than magic strings you can use enum.Enum or enum.IntFlag to better categorize your paths.
- enum.Enum can be used exactly as you are now. This means that you can only have one error per path.
- enum.IntFlag allows you to report multiple errors at the same time.
  
  I have a personal distaste for programs that hide errors from me. It means that you have to run the checker multiple times fixing the previous errors to only then get more. Rather than stating all the problems and fixing everything in one swoop.
os.path is largely obsolete by pathlib. Changing your code to use pathlib increases readability of the validators as you can easily validate if the extension is correct using:
```
if path.suffix == EXT:
```
It also allows you to easily join paths using path / str.
I would suggest using logging rather than print. This sets up your application to have correct and easily togglable debugging output. It also means that this can be turned off when in production when not debugging.
I would prefer to have WHITELIST be a set. I see no need for it to be a list.

Now, my solution is hands down more complex. It's adding enum, a Classifier class, a BaseClassifier class and is a significant increase in lines. And so it is susceptible to be an increase in maintainability costs. However in my opinion the benefits that I've described above outweigh the additional costs.

Currently both result and count are used for manual reference, later the result will be passed to the next function.

Given that you've not shown how the results are being used I have provided two classify_files functions. One that outputs the same as your currently expecting. And one that output all the paths grouped by all their errors.

import os
import pprint
import time
import logging
import pathlib

from typing import Set


MAX_AGE = 90

# 60 seconds in a minute, 60 minutes in an hour, 24 hours in a day
MINUTES = 60
HOURS = 60*MINUTES
DAYS = 24*HOURS

PATH_TO_FILES = pathlib.Path("path/to/local/copy/of/lib/")
EXT = ".pdf"
WHITELIST = {'_nl', '_en', '_de'}
SUPPLIERS = ['Siemens', 'IFM']


class BaseClassifier:
    """
    Classifier metaclass to ease classifying values.

    This is a fairly magical class that allows setting the required state.
    It also allows validating against all validators starting with `_valid_`,
    without having to manually call each of them.
    """
    DEFAULT = 0

    def __init__(self, **kwargs):
        """
        Initialize classifier state.

        Values must be passed as keywords so they can be set correctly on the instance.
        """
        for name, value in kwargs.items():
            setattr(self, name, value)

    def _get_validators(self):
        """Get all validators on the instance."""
        return [
            getattr(self, name)
            for name in dir(self)
            if name.startswith('_valid_')
        ]

    def validate(self, items):
        """Validate input against the instance's validators."""
        functions = self._get_validators()
        for item in items:
            ret = self.DEFAULT
            for function in functions:
                output = function(item)
                if output is not None:
                    ret |= output
            yield ret, item


class FileState(enum.IntFlag):
    """States that files can be in."""
    CORRECT = 0
    EXTENSION = enum.auto()
    LANGUAGE = enum.auto()
    OUT_OF_DATE = enum.auto()
    
    # A convenience for your old categories.
    # only used in the backward-compatible `classify_paths`
    INVALID = EXTENSION | LANGUAGE


class Classifier(BaseClassifier):
    """Classifiers for files."""
    __slots__ = ('ext', 'languages', 'time_limit')
    DEFAULT = FileState.CORRECT
    
    ext: str
    languages: Set[str]
    time_limit: int

    def _valid_extension(self, path):
        """Validate if path has correct extension."""
        if path.suffix != self.ext:
            return FileState.EXTENSION

    def _valid_language(self, path):
        """Validate if path has correct language."""
        if path.stem[-3:] not in self.languages:
            return FileState.LANGUAGE

    def _valid_age(self, path):
        """Validate if file is out of date."""
        if path.stat().st_mtime < self.time_limit:
            return FileState.OUT_OF_DATE


# Simpler method of grouping categories
# This utilizes the `enum` we defined earlier.
def classify_paths_simple(classifier, paths):
    """Group classifications."""
    report = {}
    for count, (state, path) in enumerate(classifier.validate(paths)):
        report.setdefault(state, []).append(path)
    return report, count


# Complex as I make the output exactly the same as the question.
def classify_paths(classifier, paths):
    """Group classifications into generic custom types."""
    report = {
        'Correct': [],
        'Invalid': [],
        'OutOfDate': [],
    }
    for count, (state, path) in enumerate(classifier.validate(paths)):
        if state == FileState.CORRECT:
            key = 'Correct'
        elif state & FileState.INVALID:
            key = 'Invalid'
        else:
            key = 'OutOfDate'
        report[key].append(path)
    return report, count


def main():
    classifier = Classifier(
        ext=EXT,
        languages=LANGUAGES,
        time_limit=time.time() - limit,
    )
    for supplier in SUPPLIERS:
        logging.debug(supplier)
        result, count = classify_paths(
            classifier,
            (PATH_TO_FILES / supplier).iterdir()
        )
        pprint.pprint(result)
        logging.info("%s files checked", count)

if __name__ == '__main__':
    main()