import os
import pprint
import timeenum
import logging
import os
import pathlib
import pprint
import time
from typing import Any, Callable, ClassVar, Dict, Generic, Iterator, List, Optional, Set, Tuple, TypeVar
TItem = TypeVar('TItem')
TFlag = TypeVar('TFlag')
MAX_AGE = 90
# 60 seconds in a minute, 60 minutes in an hour, 24 hours in a day
MINUTES = 60
HOURS = 60*MINUTES
DAYS = 24*HOURS
PATH_TO_FILES = pathlib.Path("path/to/local/copy/of/lib/"./lib")
EXT = ".pdf"
WHITELISTLANGUAGES = {'_nl', '_en', '_de'}
SUPPLIERS = ['Siemens', 'IFM']
class BaseClassifier(Generic[TItem, TFlag]):
"""
Classifier metaclass to ease classifying values.
This is a fairly magical class that allows setting the required state.
It also allows validating against all validators starting with `_valid_`,
without having to manually call each of them.
"""
DEFAULT =: 0ClassVar[TFlag]
def __init__(self, **kwargs: Any):
"""
Initialize classifier state.
Values must be passed as keywords so they can be set correctly
on the instance. For each keyword provided an attribute with the
same name is created and has the value set to the keywords value.
For example we can set the attribute `foo` by passing it as a keyword.
>>> bc = BaseClassifier(foo='bar')
>>> bc.foo
'bar'
"""
for name, value in kwargs.items():
setattr(self, name, value)
def _get_validators(self) -> List[Callable[[TItem], Optional[TFlag]]]:
"""Get"""
Get all validators on the instance.
All validators must start `_valid_` and so we loop through all
the names of the attributes on the instance, via `dir`, and
filter to ones that start with `_valid_`. For each name that
starts with `_valid_` we return the value of the attribute.
For example `Classifier` has the following validators.
Note that the test is for the function names, but it actually
returns the full fat function. Just getting the test to work
with the full fat functions is a PITA.
>>> validators = Classifier()._get_validators()
>>> [validator.__name__ for validator in validators]
['_valid_age', '_valid_extension', '_valid_language']
"""
return [
getattr(self, name)
for name in dir(self)
if name.startswith('_valid_')
]
def validate(self, items: Iterator[TItem]) -> Iterator[Tuple[TFlag, TItem]]:
"""Validate"""
Validate input against the instance's validators.
For each provided item we run all validators to build a complete
picture of any problems with the item. Once all validators have
ran we output the item as is, but with any statuses that the
validators have assigned to it.
For example the test below shows that 123_it.txt has both an
invalid language and file type as it's flag is FileState.INVALID.
>>> classifier = Classifier(\
ext=EXT,\
languages=LANGUAGES,\
time_limit=time.time() - (MAX_AGE * DAYS),\
)
>>> list(classifier.validate([pathlib.Path('./lib/IFM/123_it.txt')]))
[(<FileState.INVALID: 3>, WindowsPath('lib/IFM/123_it.txt'))]
"""
functions = self._get_validators()
for item in items:
ret = self.DEFAULT
for function in functions:
output = function(item)
if output is not None:
ret |= output
yield ret, item
class FileState(enum.IntFlag):
"""States that files can be in."""
CORRECT = 0
EXTENSION = enum.auto()
LANGUAGE = enum.auto()
OUT_OF_DATE = enum.auto()
# A convenience for your old categories.
# only used in the backward-compatible `classify_paths`
INVALID = EXTENSION | LANGUAGE
class Classifier(BaseClassifierBaseClassifier[pathlib.Path, FileState]):
"""Classifiers for files."""
__slots__ = ('ext', 'languages', 'time_limit')
DEFAULT: ClassVar[FileState] = FileState.CORRECT
ext: str
languages: Set[str]
time_limit: int
def _valid_extension(self, path: pathlib.Path) -> Optional[FileState]:
"""Validate if path has correct extension."""
if path.suffix != self.ext:
return FileState.EXTENSION
def _valid_language(self, path: pathlib.Path) -> Optional[FileState]:
"""Validate if path has correct language."""
if path.stem[-3:] not in self.languages:
return FileState.LANGUAGE
def _valid_age(self, path: pathlib.Path) -> Optional[FileState]:
"""Validate if file is out of date."""
if path.stat().st_mtime < self.time_limit:
return FileState.OUT_OF_DATE
# Simpler method of grouping categories
# This utilizes the `enum` we defined earlier.
def classify_paths_simple(
classifier: BaseClassifier,
paths: Iterator[pathlib.Path],
) -> Tuple[Dict[FileState, List[pathlib.Path]], int]:
"""Group classifications."""
report: Dict[FileState, List[pathlib.Path]] = {}
for count, (state, path) in enumerate(classifier.validate(paths)):
report.setdefault(state, []).append(path)
return report, count
# Complex as I make the output exactly the same as the question.
def classify_paths(
classifier: BaseClassifier,
paths: Iterator[pathlib.Path],
) -> Tuple[Dict[str, List[pathlib.Path]], int]:
"""Group classifications into generic custom types."""
report: Dict[str, List[pathlib.Path]] = {
'Correct': [],
'Invalid': [],
'OutOfDate': [],
}
for count, (state, path) in enumerate(classifier.validate(paths)):
if state == FileState.CORRECT:
key = 'Correct'
elif state & FileState.INVALID:
key = 'Invalid'
else:
key = 'OutOfDate'
report[key].append(path)
return report, count
def main():
classifier = Classifier(
ext=EXT,
languages=LANGUAGES,
time_limit=time.time() - limit(MAX_AGE * DAYS),
)
for supplier in SUPPLIERS:
logging.debug(supplier)
# Also use classify_paths_simple to contrast the outputs
result, count = classify_paths(
classifier,
(PATH_TO_FILES / supplier).iterdir()
)
pprint.pprint(result)
logging.info("%s files checked", count)
if __name__ == '__main__':
main()
import doctest
doctest.testmod()
import os
import pprint
import time
import logging
import pathlib
from typing import Set
MAX_AGE = 90
# 60 seconds in a minute, 60 minutes in an hour, 24 hours in a day
MINUTES = 60
HOURS = 60*MINUTES
DAYS = 24*HOURS
PATH_TO_FILES = pathlib.Path("path/to/local/copy/of/lib/")
EXT = ".pdf"
WHITELIST = {'_nl', '_en', '_de'}
SUPPLIERS = ['Siemens', 'IFM']
class BaseClassifier:
"""
Classifier metaclass to ease classifying values.
This is a fairly magical class that allows setting the required state.
It also allows validating against all validators starting with `_valid_`,
without having to manually call each of them.
"""
DEFAULT = 0
def __init__(self, **kwargs):
"""
Initialize classifier state.
Values must be passed as keywords so they can be set correctly on the instance.
"""
for name, value in kwargs.items():
setattr(self, name, value)
def _get_validators(self):
"""Get all validators on the instance."""
return [
getattr(self, name)
for name in dir(self)
if name.startswith('_valid_')
]
def validate(self, items):
"""Validate input against the instance's validators."""
functions = self._get_validators()
for item in items:
ret = self.DEFAULT
for function in functions:
output = function(item)
if output is not None:
ret |= output
yield ret, item
class FileState(enum.IntFlag):
"""States that files can be in."""
CORRECT = 0
EXTENSION = enum.auto()
LANGUAGE = enum.auto()
OUT_OF_DATE = enum.auto()
# A convenience for your old categories.
# only used in the backward-compatible `classify_paths`
INVALID = EXTENSION | LANGUAGE
class Classifier(BaseClassifier):
"""Classifiers for files."""
__slots__ = ('ext', 'languages', 'time_limit')
DEFAULT = FileState.CORRECT
ext: str
languages: Set[str]
time_limit: int
def _valid_extension(self, path):
"""Validate if path has correct extension."""
if path.suffix != self.ext:
return FileState.EXTENSION
def _valid_language(self, path):
"""Validate if path has correct language."""
if path.stem[-3:] not in self.languages:
return FileState.LANGUAGE
def _valid_age(self, path):
"""Validate if file is out of date."""
if path.stat().st_mtime < self.time_limit:
return FileState.OUT_OF_DATE
# Simpler method of grouping categories
# This utilizes the `enum` we defined earlier.
def classify_paths_simple(classifier, paths):
"""Group classifications."""
report = {}
for count, (state, path) in enumerate(classifier.validate(paths)):
report.setdefault(state, []).append(path)
return report, count
# Complex as I make the output exactly the same as the question.
def classify_paths(classifier, paths):
"""Group classifications into generic custom types."""
report = {
'Correct': [],
'Invalid': [],
'OutOfDate': [],
}
for count, (state, path) in enumerate(classifier.validate(paths)):
if state == FileState.CORRECT:
key = 'Correct'
elif state & FileState.INVALID:
key = 'Invalid'
else:
key = 'OutOfDate'
report[key].append(path)
return report, count
def main():
classifier = Classifier(
ext=EXT,
languages=LANGUAGES,
time_limit=time.time() - limit,
)
for supplier in SUPPLIERS:
logging.debug(supplier)
result, count = classify_paths(
classifier,
(PATH_TO_FILES / supplier).iterdir()
)
pprint.pprint(result)
logging.info("%s files checked", count)
if __name__ == '__main__':
main()
import enum
import logging
import os
import pathlib
import pprint
import time
from typing import Any, Callable, ClassVar, Dict, Generic, Iterator, List, Optional, Set, Tuple, TypeVar
TItem = TypeVar('TItem')
TFlag = TypeVar('TFlag')
MAX_AGE = 90
# 60 seconds in a minute, 60 minutes in an hour, 24 hours in a day
MINUTES = 60
HOURS = 60*MINUTES
DAYS = 24*HOURS
PATH_TO_FILES = pathlib.Path("./lib")
EXT = ".pdf"
LANGUAGES = {'_nl', '_en', '_de'}
SUPPLIERS = ['Siemens', 'IFM']
class BaseClassifier(Generic[TItem, TFlag]):
"""
Classifier metaclass to ease classifying values.
This is a fairly magical class that allows setting the required state.
It also allows validating against all validators starting with `_valid_`,
without having to manually call each of them.
"""
DEFAULT: ClassVar[TFlag]
def __init__(self, **kwargs: Any):
"""
Initialize classifier state.
Values must be passed as keywords so they can be set correctly
on the instance. For each keyword provided an attribute with the
same name is created and has the value set to the keywords value.
For example we can set the attribute `foo` by passing it as a keyword.
>>> bc = BaseClassifier(foo='bar')
>>> bc.foo
'bar'
"""
for name, value in kwargs.items():
setattr(self, name, value)
def _get_validators(self) -> List[Callable[[TItem], Optional[TFlag]]]:
"""
Get all validators on the instance.
All validators must start `_valid_` and so we loop through all
the names of the attributes on the instance, via `dir`, and
filter to ones that start with `_valid_`. For each name that
starts with `_valid_` we return the value of the attribute.
For example `Classifier` has the following validators.
Note that the test is for the function names, but it actually
returns the full fat function. Just getting the test to work
with the full fat functions is a PITA.
>>> validators = Classifier()._get_validators()
>>> [validator.__name__ for validator in validators]
['_valid_age', '_valid_extension', '_valid_language']
"""
return [
getattr(self, name)
for name in dir(self)
if name.startswith('_valid_')
]
def validate(self, items: Iterator[TItem]) -> Iterator[Tuple[TFlag, TItem]]:
"""
Validate input against the instance's validators.
For each provided item we run all validators to build a complete
picture of any problems with the item. Once all validators have
ran we output the item as is, but with any statuses that the
validators have assigned to it.
For example the test below shows that 123_it.txt has both an
invalid language and file type as it's flag is FileState.INVALID.
>>> classifier = Classifier(\
ext=EXT,\
languages=LANGUAGES,\
time_limit=time.time() - (MAX_AGE * DAYS),\
)
>>> list(classifier.validate([pathlib.Path('./lib/IFM/123_it.txt')]))
[(<FileState.INVALID: 3>, WindowsPath('lib/IFM/123_it.txt'))]
"""
functions = self._get_validators()
for item in items:
ret = self.DEFAULT
for function in functions:
output = function(item)
if output is not None:
ret |= output
yield ret, item
class FileState(enum.IntFlag):
"""States that files can be in."""
CORRECT = 0
EXTENSION = enum.auto()
LANGUAGE = enum.auto()
OUT_OF_DATE = enum.auto()
# A convenience for your old categories.
# only used in the backward-compatible `classify_paths`
INVALID = EXTENSION | LANGUAGE
class Classifier(BaseClassifier[pathlib.Path, FileState]):
"""Classifiers for files."""
__slots__ = ('ext', 'languages', 'time_limit')
DEFAULT: ClassVar[FileState] = FileState.CORRECT
ext: str
languages: Set[str]
time_limit: int
def _valid_extension(self, path: pathlib.Path) -> Optional[FileState]:
"""Validate if path has correct extension."""
if path.suffix != self.ext:
return FileState.EXTENSION
def _valid_language(self, path: pathlib.Path) -> Optional[FileState]:
"""Validate if path has correct language."""
if path.stem[-3:] not in self.languages:
return FileState.LANGUAGE
def _valid_age(self, path: pathlib.Path) -> Optional[FileState]:
"""Validate if file is out of date."""
if path.stat().st_mtime < self.time_limit:
return FileState.OUT_OF_DATE
# Simpler method of grouping categories
# This utilizes the `enum` we defined earlier.
def classify_paths_simple(
classifier: BaseClassifier,
paths: Iterator[pathlib.Path],
) -> Tuple[Dict[FileState, List[pathlib.Path]], int]:
"""Group classifications."""
report: Dict[FileState, List[pathlib.Path]] = {}
for count, (state, path) in enumerate(classifier.validate(paths)):
report.setdefault(state, []).append(path)
return report, count
# Complex as I make the output exactly the same as the question.
def classify_paths(
classifier: BaseClassifier,
paths: Iterator[pathlib.Path],
) -> Tuple[Dict[str, List[pathlib.Path]], int]:
"""Group classifications into generic custom types."""
report: Dict[str, List[pathlib.Path]] = {
'Correct': [],
'Invalid': [],
'OutOfDate': [],
}
for count, (state, path) in enumerate(classifier.validate(paths)):
if state == FileState.CORRECT:
key = 'Correct'
elif state & FileState.INVALID:
key = 'Invalid'
else:
key = 'OutOfDate'
report[key].append(path)
return report, count
def main():
classifier = Classifier(
ext=EXT,
languages=LANGUAGES,
time_limit=time.time() - (MAX_AGE * DAYS),
)
for supplier in SUPPLIERS:
logging.debug(supplier)
# Also use classify_paths_simple to contrast the outputs
result, count = classify_paths(
classifier,
(PATH_TO_FILES / supplier).iterdir()
)
pprint.pprint(result)
logging.info("%s files checked", count)
if __name__ == '__main__':
main()
import doctest
doctest.testmod()
Your code is small, but there's a lot going on that I find it hard to understand. Given that this is just the beginning, the complexity of the code will just increase as time goes on.
I found the large amount of globals to hinder readability, as each time I came across another I had to scroll to the top of the code to find out what it is.
Whilst I prefer not to have globals and hide them away in classes. I think there are two options when using a class:
- Make a
Classifierclass that just has those globals defined on the class as class variables. - Make a
Classifierclass that is provided the values at instantiation. And utilizetypingto convey what the values contain.
- Make a
Whilst I find a class helps reduce complexity. It should also increase general maintainability and readability. Currently your function is a bug/feature hotspot, and when you add more classifiers to the function the complexity will only increase.
Splitting the different validation checks into their own methods, or functions, helps break down the function and make the code small palatable chunks. And so increases readability.
It also increases maintainability as when you need to add additional functionality to the function, you instead have a small self-contained function. Which is simpler to re-learn, when coming back to the function after some time. In the changes I made the validation checks each became a method containing a single
ifandreturn.I'm currently not automatically iterating over all suppliers because for some different standards apply.
This also increases customization, as you can easily define multiple classes with the different validation checks. Utilizing either inheritance or mixins to simplify the code.
Since the
Classifierclass will have lots of functions that will be called in the same way, I would opt to make aBaseClassifierclass that magically handles everything.- Since I would pass values in at instantiation I would define the
__init__and simply assign the value to the instance. - Since we'll have lots of functions, to reduce the amount of WET I would automagically find the functions to run using
dirandgetattr. - Since the functions will all be called in the same way, defining a simple
validatefunction can reduce the amount of code to write on each classifier.
- Since I would pass values in at instantiation I would define the
I personally am not a fan of magic strings for your categories.
Rather than magic strings you can use
enum.Enumorenum.IntFlagto better categorize your paths.enum.Enumcan be used exactly as you are now. This means that you can only have one error per path.enum.IntFlagallows you to report multiple errors at the same time.I have a personal distaste for programs that hide errors from me. It means that you have to run the checker multiple times fixing the previous errors to only then get more. Rather than stating all the problems and fixing everything in one swoop.
os.pathis largely obsolete bypathlib. Changing your code to usepathlibincreases readability of the validators as you can easily validate if the extension is correct using:if path.suffix == EXT:It also allows you to easily join paths using
path / str.I would suggest using
loggingrather thanprint. This sets up your application to have correct and easily togglable debugging output. It also means that this can be turned off when in production when not debugging.I would prefer to have
WHITELISTbe a set. I see no need for it to be a list.
Now, my solution is hands down more complex. It's adding enum, a Classifier class, a BaseClassifier class and is a significant increase in lines. And so it is susceptible to be an increase in maintainability costs. However in my opinion the benefits that I've described above outweigh the additional costs.
Currently both result and count are used for manual reference, later the result will be passed to the next function.
Given that you've not shown how the results are being used I have provided two classify_files functions. One that outputs the same as your currently expecting. And one that output all the paths grouped by all their errors.
import os
import pprint
import time
import logging
import pathlib
from typing import Set
MAX_AGE = 90
# 60 seconds in a minute, 60 minutes in an hour, 24 hours in a day
MINUTES = 60
HOURS = 60*MINUTES
DAYS = 24*HOURS
PATH_TO_FILES = pathlib.Path("path/to/local/copy/of/lib/")
EXT = ".pdf"
WHITELIST = {'_nl', '_en', '_de'}
SUPPLIERS = ['Siemens', 'IFM']
class BaseClassifier:
"""
Classifier metaclass to ease classifying values.
This is a fairly magical class that allows setting the required state.
It also allows validating against all validators starting with `_valid_`,
without having to manually call each of them.
"""
DEFAULT = 0
def __init__(self, **kwargs):
"""
Initialize classifier state.
Values must be passed as keywords so they can be set correctly on the instance.
"""
for name, value in kwargs.items():
setattr(self, name, value)
def _get_validators(self):
"""Get all validators on the instance."""
return [
getattr(self, name)
for name in dir(self)
if name.startswith('_valid_')
]
def validate(self, items):
"""Validate input against the instance's validators."""
functions = self._get_validators()
for item in items:
ret = self.DEFAULT
for function in functions:
output = function(item)
if output is not None:
ret |= output
yield ret, item
class FileState(enum.IntFlag):
"""States that files can be in."""
CORRECT = 0
EXTENSION = enum.auto()
LANGUAGE = enum.auto()
OUT_OF_DATE = enum.auto()
# A convenience for your old categories.
# only used in the backward-compatible `classify_paths`
INVALID = EXTENSION | LANGUAGE
class Classifier(BaseClassifier):
"""Classifiers for files."""
__slots__ = ('ext', 'languages', 'time_limit')
DEFAULT = FileState.CORRECT
ext: str
languages: Set[str]
time_limit: int
def _valid_extension(self, path):
"""Validate if path has correct extension."""
if path.suffix != self.ext:
return FileState.EXTENSION
def _valid_language(self, path):
"""Validate if path has correct language."""
if path.stem[-3:] not in self.languages:
return FileState.LANGUAGE
def _valid_age(self, path):
"""Validate if file is out of date."""
if path.stat().st_mtime < self.time_limit:
return FileState.OUT_OF_DATE
# Simpler method of grouping categories
# This utilizes the `enum` we defined earlier.
def classify_paths_simple(classifier, paths):
"""Group classifications."""
report = {}
for count, (state, path) in enumerate(classifier.validate(paths)):
report.setdefault(state, []).append(path)
return report, count
# Complex as I make the output exactly the same as the question.
def classify_paths(classifier, paths):
"""Group classifications into generic custom types."""
report = {
'Correct': [],
'Invalid': [],
'OutOfDate': [],
}
for count, (state, path) in enumerate(classifier.validate(paths)):
if state == FileState.CORRECT:
key = 'Correct'
elif state & FileState.INVALID:
key = 'Invalid'
else:
key = 'OutOfDate'
report[key].append(path)
return report, count
def main():
classifier = Classifier(
ext=EXT,
languages=LANGUAGES,
time_limit=time.time() - limit,
)
for supplier in SUPPLIERS:
logging.debug(supplier)
result, count = classify_paths(
classifier,
(PATH_TO_FILES / supplier).iterdir()
)
pprint.pprint(result)
logging.info("%s files checked", count)
if __name__ == '__main__':
main()