I am trying to write code that gives me BBFC film ratings. I am currently using Selenium, but I would be happy with any solution that works reliably. After a lot of work I finally came up with this code:
# pip install -U selenium
from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import json, re, time
from urllib.parse import quote_plus, urlparse
# BBFC certificate symbols. Longer alternatives come first so "R18" is not
# matched as "18" and "12A" is not swallowed by "12".
RATING_RE = re.compile(r"\b(R18|18|15|12A|12|PG|U)\b", re.I)
# -------------------- Browser --------------------
def start_driver(headless=True):
    """Launch a Firefox WebDriver tuned for fast scraping.

    Image loading is disabled and the page-load strategy is 'eager' so
    navigation returns as soon as the DOM is ready rather than waiting for
    images/fonts.  A fixed desktop user agent is set so headless runs behave
    consistently with a normal browser session.
    """
    options = webdriver.FirefoxOptions()
    if headless:
        options.add_argument("--headless")
    options.page_load_strategy = "eager"  # faster: don't wait for images/fonts
    options.set_preference("permissions.default.image", 2)  # block images
    options.set_preference(
        "general.useragent.override",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0",
    )
    return webdriver.Firefox(service=FirefoxService(), options=options)
def accept_cookies(driver):
    """Best-effort dismissal of a cookie-consent banner.

    Tries a few known OneTrust selectors plus generic accept/agree buttons;
    returns after the first successful click, or silently if none appear.
    """
    candidates = (
        "#onetrust-accept-btn-handler",
        "button#onetrust-accept-btn-handler",
        "//button[contains(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'accept')]",
        "//button[contains(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'agree')]",
    )
    for selector in candidates:
        try:
            # XPath selectors start with "//"; everything else is CSS.
            locator = (By.XPATH, selector) if selector.startswith("//") else (By.CSS_SELECTOR, selector)
            WebDriverWait(driver, 3).until(EC.element_to_be_clickable(locator)).click()
            time.sleep(0.15)  # give the banner a moment to disappear
            return
        except Exception:
            pass
# -------------------- Page parsing --------------------
def _jsonld_nodes(driver):
    """Collect every parseable JSON-LD object on the current page.

    Top-level JSON arrays are flattened so the caller always gets a flat
    list of nodes; unparseable script tags are skipped.
    """
    nodes = []
    scripts = driver.find_elements(By.CSS_SELECTOR, "script[type='application/ld+json']")
    for script in scripts:
        try:
            data = json.loads(script.get_attribute("textContent") or "")
            nodes.extend(data if isinstance(data, list) else [data])
        except Exception:
            pass
    return nodes
def page_title(driver):
    """Extract the film title from the current page, or None.

    Order of preference: JSON-LD "name" (directly or nested under
    itemReviewed/about/workExample), og:title meta, document.title,
    then the first <h1>.  The "| BBFC" site suffix is stripped from
    meta/document titles.
    """
    for node in _jsonld_nodes(driver):
        if not isinstance(node, dict):
            continue
        name = node.get("name")
        if isinstance(name, str) and name.strip():
            return name.strip()
        for key in ("itemReviewed", "about", "workExample"):
            nested = node.get(key)
            if isinstance(nested, dict):
                name = nested.get("name")
                if isinstance(name, str) and name.strip():
                    return name.strip()
    # og:title, then document.title
    try:
        metas = driver.find_elements(By.CSS_SELECTOR, "meta[property='og:title'], meta[name='og:title']")
        if metas:
            title = (metas[0].get_attribute("content") or "").strip()
            title = re.sub(r"\s*\|\s*BBFC.*$", "", title)
            if title:
                return title
    except Exception:
        pass
    try:
        title = (driver.execute_script("return document.title") or "").strip()
        title = re.sub(r"\s*\|\s*BBFC.*$", "", title)
        if title:
            return title
    except Exception:
        pass
    try:
        title = (driver.find_element(By.TAG_NAME, "h1").text or "").strip()
        if title:
            return title
    except Exception:
        pass
    return None
def page_year(driver):
    """Extract a 4-digit year (19xx/20xx) from the current page, or None.

    Tries JSON-LD date fields first (datePublished, dateCreated, releaseDate,
    then releasedEvent start/end dates), falling back to a labelled <dt>
    ("release date", "classification date", or just "date") whose following
    sibling holds the value.
    """
    year_pattern = r"\b(19|20)\d{2}\b"
    for node in _jsonld_nodes(driver):
        if not isinstance(node, dict):
            continue
        for key in ("datePublished", "dateCreated", "releaseDate"):
            value = node.get(key)
            if isinstance(value, str):
                m = re.search(year_pattern, value)
                if m:
                    return int(m.group(0))
        event = node.get("releasedEvent")
        if isinstance(event, dict):
            for key in ("startDate", "endDate", "date"):
                value = event.get(key)
                if isinstance(value, str):
                    m = re.search(year_pattern, value)
                    if m:
                        return int(m.group(0))
    # Labelled dt/dd fallback, scoped to <main> when available.
    try:
        root = driver.find_element(By.TAG_NAME, "main")
    except Exception:
        root = driver.find_element(By.TAG_NAME, "body")
    try:
        dd = root.find_element(
            By.XPATH,
            "//dt[contains(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'release date')"
            " or contains(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'classification date')"
            " or contains(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'date')]/following-sibling::*[1]"
        )
        m = re.search(year_pattern, (dd.text or dd.get_attribute("textContent") or ""))
        if m:
            return int(m.group(0))
    except Exception:
        pass
    return None
def page_rating(driver):
    """Extract the BBFC certificate (e.g. '15', '12A') from the page, or None.

    Order: JSON-LD contentRating (directly, then nested under itemReviewed /
    workExample / about); a labelled <dt> ("age rating", "classification",
    "certificate") with its sibling value; finally a scan of likely badge
    elements (img alt text, aria labels, data-rating, rating-ish classes).
    """
    # JSON-LD first
    for node in _jsonld_nodes(driver):
        if not isinstance(node, dict):
            continue
        value = node.get("contentRating")
        if isinstance(value, str):
            m = RATING_RE.search(value)
            if m:
                return m.group(1).upper()
        for key in ("itemReviewed", "workExample", "about"):
            nested = node.get(key)
            if isinstance(nested, dict):
                value = nested.get("contentRating")
                if isinstance(value, str):
                    m = RATING_RE.search(value)
                    if m:
                        return m.group(1).upper()
    # Labelled dt/dd fallback
    try:
        root = driver.find_element(By.TAG_NAME, "main")
    except Exception:
        root = driver.find_element(By.TAG_NAME, "body")
    try:
        dd = root.find_element(
            By.XPATH,
            "//dt[contains(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'age rating')"
            " or contains(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'classification')"
            " or contains(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'certificate')]/following-sibling::*[1]"
        )
        m = RATING_RE.search(dd.text or dd.get_attribute("textContent") or "")
        if m:
            return m.group(1).upper()
    except Exception:
        pass
    # Badge scan fallback (alt / aria-label / data attributes / visible text)
    for element in root.find_elements(By.CSS_SELECTOR, "img[alt], [aria-label], [data-rating], [class*='rating' i]"):
        blob = " ".join([
            element.get_attribute("alt") or "",
            element.get_attribute("aria-label") or "",
            element.get_attribute("data-rating") or "",
            element.get_attribute("title") or "",
            element.text or "",
        ])
        m = RATING_RE.search(blob)
        if m:
            return m.group(1).upper()
    return None
def follow_classification_link(driver):
    """Return the href of the first on-page link to a /classification/ page.

    Scoped to <main> when present (falling back to <body>); returns None if
    no such link exists.
    """
    try:
        root = driver.find_element(By.TAG_NAME, "main")
    except Exception:
        root = driver.find_element(By.TAG_NAME, "body")
    try:
        anchor = root.find_element(By.CSS_SELECTOR, "a[href*='/classification/']")
        return anchor.get_attribute("href")
    except Exception:
        return None
# -------------------- Exact-title filtering helpers --------------------
def _norm_title(s: str) -> str:
return re.sub(r"\s+", " ", re.sub(r"[^\w]+", " ", (s or "")).strip()).lower()
def _title_from_card_text(text: str) -> str:
# e.g. "The Matrix (1999)\nContains ..." -> "The Matrix"
if not text: return ""
first_line = text.splitlines()[0]
m = re.match(r"\s*([^(]+?)\s*(?:\(|$)", first_line)
return (m.group(1).strip() if m else first_line.strip())
def _title_from_slug(url: str) -> str:
# https://www.bbfc.co.uk/release/the-matrix-q29... -> "the matrix"
slug = urlparse(url).path.split("/")[-1]
slug = slug.split("-q29", 1)[0]
return re.sub(r"[-_]+", " ", slug).strip()
# -------------------- BBFC search (with exact filter) --------------------
def _click_load_more_if_any(driver):
    """Click a "load more" / "More results" button up to three times.

    Stops as soon as the button can't be found or clicked; each click is
    followed by a short pause so new results can render.
    """
    for _ in range(3):
        try:
            button = driver.find_element(
                By.XPATH,
                "//button[contains(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'load more') or contains(., 'More results')]"
            )
            driver.execute_script("arguments[0].scrollIntoView({block:'center'});", button)
            button.click()
            time.sleep(0.8)
        except Exception:
            break
def _collect_search_links(driver, title, year_hint=None, limit=40, exact_title=True):
    """Search bbfc.co.uk for *title*; return up to *limit* (href, card_text) pairs.

    Both known search query parameters (?q= and ?term=) are tried; the first
    URL that yields hits wins.  With exact_title=True only cards whose visible
    title or URL slug normalises to the requested title are kept.  Results
    are ordered: classification pages first, then by a relevance score
    (+5 exact card title, +3 year_hint present in the card text), then href.
    Duplicates (by href) are dropped.
    """
    want = _norm_title(title)
    hits = []
    for search_url in (
        f"https://www.bbfc.co.uk/search?q={quote_plus(title)}",
        f"https://www.bbfc.co.uk/search?term={quote_plus(title)}",
    ):
        try:
            driver.get(search_url)
            accept_cookies(driver)
            WebDriverWait(driver, 18).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            time.sleep(0.7)
            _click_load_more_if_any(driver)
            # Scroll to the bottom a couple of times to trigger lazy loading.
            for _ in range(2):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(0.8)
            anchors = driver.find_elements(By.CSS_SELECTOR, "a[href*='/classification/'], a[href*='/release/']")
            for anchor in anchors:
                href = anchor.get_attribute("href") or ""
                text = anchor.text or ""
                if not href:
                    continue
                if "/classification/" not in href and "/release/" not in href:
                    continue
                if exact_title:
                    # Require an exact match by card title OR URL slug.
                    same_card = _norm_title(_title_from_card_text(text)) == want
                    same_slug = _norm_title(_title_from_slug(href)) == want
                    if not (same_card or same_slug):
                        continue
                hits.append((href, text))
            if hits:
                break
        except Exception:
            continue
    if not hits:
        return []

    def relevance(card_text):
        score = 0
        if _norm_title(_title_from_card_text(card_text)) == want:
            score += 5
        if year_hint and str(year_hint) in (card_text or ""):
            score += 3
        return score

    hits.sort(key=lambda hit: ("/classification/" not in hit[0], -relevance(hit[1]), hit[0]))
    # Dedupe by href, preserving sort order, and cap at *limit*.
    seen, unique = set(), []
    for href, text in hits:
        if href in seen:
            continue
        seen.add(href)
        unique.append((href, text))
        if len(unique) >= limit:
            break
    return unique
# -------------------- Matching & main lookup --------------------
def _title_matches(requested_title, page_t, url, exact_title):
    """Decide whether a candidate page matches the requested title.

    exact_title=True: the normalised page title OR URL slug must equal the
    normalised request.  Otherwise a relaxed test is used: equality, prefix
    match, or the requested words being a subset of the page-title words
    (falling back to the slug words when no page title was found).
    """
    want = _norm_title(requested_title)
    if exact_title:
        if page_t and _norm_title(page_t) == want:
            return True
        return _norm_title(_title_from_slug(url)) == want
    if page_t:
        got = _norm_title(page_t)
        return got == want or got.startswith(want) or set(want.split()).issubset(set(got.split()))
    return set(want.split()).issubset(set(_norm_title(_title_from_slug(url)).split()))
def find_best_rating(driver, title, specified_year=None, exact_title=True):
    """Look up *title* on the BBFC site and return its certificate.

    If specified_year is None the rating for the MOST RECENT UK year wins.
    exact_title=True opens only exact-title matches (much faster for names
    like 'The Matrix').

    Returns:
        (rating_or_None, url_or_None, resolved_title_or_None, resolved_year_or_None)
    """
    candidates = _collect_search_links(driver, title, year_hint=specified_year, exact_title=exact_title)
    if not candidates:
        return None, None, None, None

    # Each visited entry: (year or 0, is_classification_page, rating_or_None, url, resolved_title)
    visited = []
    for href, card_text in candidates:
        try:
            driver.get(href)
            WebDriverWait(driver, 18).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            accept_cookies(driver)
            resolved_title = page_title(driver) or _title_from_slug(href).title()
            year = page_year(driver)
            if not year:
                # Fallback: pull a "(1999)"-style year out of the search-card text.
                m = re.search(r"\b(19|20)\d{2}\b", card_text or "")
                year = int(m.group(0)) if m else None
            if not _title_matches(title, resolved_title, href, exact_title):
                continue
            rating = page_rating(driver)
            if (not rating) and "/release/" in href:
                # Release pages may lack the badge; hop to the linked
                # classification page and retry there.
                hop = follow_classification_link(driver)
                if hop:
                    driver.get(hop)
                    WebDriverWait(driver, 12).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
                    resolved_title = page_title(driver) or resolved_title
                    year = page_year(driver) or year
                    rating = page_rating(driver)
                    href = hop
            year_int = int(year) if isinstance(year, int) else None
            is_classification = "/classification/" in href
            visited.append((year_int or 0, is_classification, rating, href, resolved_title))
        except Exception:
            continue

    if not visited:
        return None, None, None, None

    if specified_year is not None:
        # Prefer rated entries from exactly the requested year,
        # classification pages first.
        same_year = [v for v in visited if v[0] == int(specified_year) and v[2]]
        if same_year:
            same_year.sort(key=lambda v: (not v[1], -v[0]))
            yr, _pref, rating, url, resolved_title = same_year[0]
            return rating, url, resolved_title, yr

    rated = [v for v in visited if v[2]]
    if rated:
        # Latest year wins; classification pages break ties.
        yr, _pref, rating, url, resolved_title = max(rated, key=lambda v: (v[0], v[1]))
        return rating, url, resolved_title, (yr or None)
    yr, _pref, rating, url, resolved_title = max(visited, key=lambda v: (v[0], v[1]))
    return None, url, resolved_title, (yr or None)
# -------------------- Convenience wrapper --------------------
def lookup_bbfc_rating(title, year=None, headless=True, exact_title=True):
    """One-shot convenience wrapper: start a browser, look up one title, quit.

    Returns the same 4-tuple as find_best_rating().
    """
    driver = start_driver(headless=headless)
    try:
        return find_best_rating(driver, title, specified_year=year, exact_title=exact_title)
    finally:
        driver.quit()
# -------------------- Interactive CLI --------------------
if __name__ == "__main__":
    # Interactive CLI: prompt for titles in a loop, reusing one browser.
    # headless=False so you can watch the browser work.
    drv = start_driver(headless=False)
    try:
        while True:
            title = input("\nFilm title (blank to quit): ").strip()
            if not title:
                break
            ytxt = input("UK classification year (optional, press Enter to skip): ").strip()
            year = int(ytxt) if ytxt.isdigit() else None
            rating, url, resolved_title, resolved_year = find_best_rating(
                drv, title, specified_year=year, exact_title=True  # exact-only matching
            )
            if rating:
                if resolved_year:
                    print(f"→ {resolved_title} ({resolved_year}) BBFC rating: {rating}\n {url}")
                else:
                    print(f"→ {resolved_title} BBFC rating: {rating}\n {url}")
            elif url:
                print(f"→ Could not find a rating, but found a page:\n {resolved_title} ({resolved_year or 'year ?'})\n {url}")
            else:
                print("→ No matching BBFC page found.")
    finally:
        drv.quit()
The code seems much longer than necessary and it is very slow.
Is there a shorter and faster solution that works reliably?