# REFACTOR based on review feedback.
@dataclass
class SEOCheckResult:
    """Outcome of one individual SEO check."""

    error: bool  # True when the check found a problem on the page
    description: str  # human-readable explanation of the check outcome
    solution: Optional[str] = None  # suggested remediation, when available
    code: Optional[str] = None  # example markup/snippet illustrating the fix — presumably; TODO confirm against check functions
@dataclass
class SEOChecks:
    """Group of SEOCheckResult entries belonging to one check category."""

    category: str  # category label (e.g. the name of the check group)
    category_results: List[SEOCheckResult] = field(default_factory=list)  # results collected for this category
@dataclass(eq=False)
class PageReport:
    """Runs the registered SEO checks against one fetched page.

    ``eq=False`` keeps the default identity-based equality/hash, so
    instances remain hashable despite the mutable ``page_results`` field.
    """

    html: str  # raw page markup as fetched
    domain: str  # netloc of the crawled URL
    url: str  # full URL the page was fetched from
    keyword: str  # target keyword passed to every check function
    page_results: List[SEOChecks] = field(default_factory=list)  # accumulated check output

    def generate_results(self):
        """Run each registered check function and append its result.

        Each check receives the parsed soup and the target keyword and
        is expected to return an ``SEOChecks`` instance.
        """
        seo_check_functions = [
            analyze_h_tag,
        ]
        for seo_func in seo_check_functions:
            self.page_results.append(seo_func(self.soup, self.keyword))

    @cached_property
    def soup(self):
        """Parsed BeautifulSoup tree for ``html``, built once per instance.

        ``functools.cached_property`` replaces the previous
        ``@property`` + ``@lru_cache`` stack: an ``lru_cache`` on an
        instance method keys the global cache on ``self`` and keeps every
        PageReport alive for the cache's lifetime (ruff B019). The cached
        value now lives on the instance and dies with it.
        """
        return BeautifulSoup(self.html, "lxml")
from functools import cached_property

from .http import http_client
class Crawler:
    """Fetches a single page and produces its SEO check results."""

    def __init__(self, url, keyword):
        """Normalize *url* (adding ``http://`` when no scheme is given) and
        remember the target *keyword*.
        """
        # Last request failure seen by _html(); exposed for callers to inspect.
        self.conn_error = None
        # Default to http:// when the caller omitted a scheme.
        self.url = url if urlsplit(url).scheme else f"http://{url}"
        self.parse = urlsplit(self.url)
        self.domain = self.parse.netloc
        self.keyword = keyword

    def _html(self):
        """Return the page body, or None (with ``conn_error`` set) on failure.

        ``RequestException`` is the base class of ``HTTPError``,
        ``ConnectionError`` and ``Timeout``, so this single clause covers
        the previous four-way tuple with identical behavior.
        """
        try:
            response = http_client.get(self.url)
        except requests.exceptions.RequestException as e:
            self.conn_error = e
            return None
        return response.text

    def crawl_page(self):
        """Fetch ``self.url`` and run all SEO checks.

        Returns the list of SEOChecks results, or None when the fetch
        failed (inspect ``conn_error`` for the reason).
        """
        html = self._html()
        if html is None:
            return None
        page = PageReport(
            html=html,
            domain=self.domain,
            url=self.url,
            keyword=self.keyword,
        )
        page.generate_results()
        return page.page_results
class HttpClient:
    """Thin wrapper around a shared ``requests.Session``."""

    def __init__(self):
        # One session for the process: reuses connections across requests.
        self.session = requests.Session()

    def get(self, url):
        """GET *url* with a 15s timeout, raising for HTTP error statuses."""
        response = self.session.get(url, timeout=15)
        response.raise_for_status()
        return response


# Module-level singleton shared by the crawler.
http_client = HttpClient()