-4

I am trying to write code to give me BBFC film ratings. I am using selenium to do this but would be happy with any solution that works reliably. After a lot of work I finally came up with this code:

# pip install -U selenium
from selenium import webdriver
from selenium.webdriver.firefox.service import Service as FirefoxService
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import json, re, time
from urllib.parse import quote_plus, urlparse

# BBFC certificate symbols. Branches are ordered longest-first so the regex
# alternation (tried left to right) matches "R18" before "18" and "12A"
# before "12"; case-insensitive to tolerate lowercase badge text.
RATING_RE = re.compile(r"\b(R18|18|15|12A|12|PG|U)\b", re.I)

# -------------------- Browser --------------------
def start_driver(headless=True):
    """Create and return a configured Firefox WebDriver.

    Images are disabled and the page-load strategy set to "eager" so
    navigation returns as soon as the DOM is ready.
    """
    options = webdriver.FirefoxOptions()
    if headless:
        options.add_argument("--headless")
    # Eager: don't block on images/fonts finishing their downloads.
    options.page_load_strategy = "eager"
    options.set_preference("permissions.default.image", 2)
    # Fixed desktop UA keeps headless sessions serving the same markup.
    options.set_preference(
        "general.useragent.override",
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0",
    )
    return webdriver.Firefox(service=FirefoxService(), options=options)

def accept_cookies(driver):
    """Best-effort dismissal of a cookie-consent banner.

    Tries a series of known selectors; returns after the first successful
    click, or silently when no banner is found.
    """
    candidates = (
        "#onetrust-accept-btn-handler",
        "button#onetrust-accept-btn-handler",
        "//button[contains(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'accept')]",
        "//button[contains(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'agree')]",
    )
    for selector in candidates:
        if selector.startswith("//"):
            locator = (By.XPATH, selector)
        else:
            locator = (By.CSS_SELECTOR, selector)
        try:
            WebDriverWait(driver, 3).until(EC.element_to_be_clickable(locator)).click()
            time.sleep(0.15)
            return
        except Exception:
            continue

# -------------------- Page parsing --------------------
def _jsonld_nodes(driver):
    """Return every JSON-LD object embedded in the current page as a flat list."""
    nodes = []
    scripts = driver.find_elements(By.CSS_SELECTOR, "script[type='application/ld+json']")
    for script in scripts:
        try:
            parsed = json.loads(script.get_attribute("textContent") or "")
        except Exception:
            # Malformed/empty JSON-LD blocks are simply skipped.
            continue
        if isinstance(parsed, list):
            nodes.extend(parsed)
        else:
            nodes.append(parsed)
    return nodes

def page_title(driver):
    """Extract the film title from the current page.

    Tries, in order: JSON-LD "name" fields, the og:title meta tag,
    document.title, and the first <h1>. Returns None if none succeed.
    """
    # 1) JSON-LD: top-level "name", then names nested under common wrappers.
    for node in _jsonld_nodes(driver):
        if not isinstance(node, dict):
            continue
        names = [node.get("name")]
        for key in ("itemReviewed", "about", "workExample"):
            wrapped = node.get(key)
            if isinstance(wrapped, dict):
                names.append(wrapped.get("name"))
        for candidate in names:
            if isinstance(candidate, str) and candidate.strip():
                return candidate.strip()
    # 2) og:title, with any trailing "| BBFC ..." suffix stripped.
    try:
        metas = driver.find_elements(By.CSS_SELECTOR, "meta[property='og:title'], meta[name='og:title']")
        if metas:
            title = (metas[0].get_attribute("content") or "").strip()
            title = re.sub(r"\s*\|\s*BBFC.*$", "", title)
            if title:
                return title
    except Exception:
        pass
    # 3) document.title, same suffix stripping.
    try:
        title = (driver.execute_script("return document.title") or "").strip()
        title = re.sub(r"\s*\|\s*BBFC.*$", "", title)
        if title:
            return title
    except Exception:
        pass
    # 4) First <h1> on the page.
    try:
        heading = (driver.find_element(By.TAG_NAME, "h1").text or "").strip()
        if heading:
            return heading
    except Exception:
        pass
    return None

def page_year(driver):
    """Best-effort extraction of a 4-digit year (19xx/20xx) from the page.

    Tries, in order: JSON-LD date fields, JSON-LD releasedEvent dates,
    then a labelled <dt>/<dd> pair. Returns an int year or None.
    """
    # 1) Direct date-ish fields on each JSON-LD node.
    for n in _jsonld_nodes(driver):
        if not isinstance(n, dict): continue
        for key in ("datePublished", "dateCreated", "releaseDate"):
            val = n.get(key)
            if isinstance(val, str):
                m = re.search(r"\b(19|20)\d{2}\b", val)
                if m: return int(m.group(0))
        # 2) Dates nested under a releasedEvent object.
        sub = n.get("releasedEvent")
        if isinstance(sub, dict):
            for key in ("startDate","endDate","date"):
                val = sub.get(key)
                if isinstance(val, str):
                    m = re.search(r"\b(19|20)\d{2}\b", val)
                    if m: return int(m.group(0))
    # labelled dd fallback
    # 3) Scope to <main> when present, otherwise fall back to <body>.
    try:
        root = driver.find_element(By.TAG_NAME, "main")
    except Exception:
        root = driver.find_element(By.TAG_NAME, "body")
    try:
        # Find a <dt> whose text mentions a date label and read its sibling.
        # NOTE(review): the leading "//" makes this XPath search the whole
        # document, not just `root` — confirm that is intended.
        dd = root.find_element(
            By.XPATH,
            "//dt[contains(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'release date')"
            " or contains(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'classification date')"
            " or contains(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'date')]/following-sibling::*[1]"
        )
        m = re.search(r"\b(19|20)\d{2}\b", (dd.text or dd.get_attribute("textContent") or ""))
        if m: return int(m.group(0))
    except Exception:
        pass
    return None

def page_rating(driver):
    """Best-effort extraction of the BBFC certificate (U/PG/12/12A/15/18/R18).

    Tries, in order: JSON-LD contentRating, a labelled <dt>/<dd> pair,
    then rating-ish badge attributes. Returns the uppercased symbol or None.
    """
    # JSON-LD first
    for n in _jsonld_nodes(driver):
        if isinstance(n, dict):
            val = n.get("contentRating")
            if isinstance(val, str):
                m = RATING_RE.search(val)
                if m: return m.group(1).upper()
            # contentRating may sit on a wrapped work rather than the top node.
            for k in ("itemReviewed", "workExample", "about"):
                sub = n.get(k)
                if isinstance(sub, dict):
                    val = sub.get("contentRating")
                    if isinstance(val, str):
                        m = RATING_RE.search(val)
                        if m: return m.group(1).upper()
    # labelled fallback
    # Scope to <main> when present, otherwise <body>. `root` is also reused
    # by the proximity fallback at the bottom of this function.
    try:
        root = driver.find_element(By.TAG_NAME, "main")
    except Exception:
        root = driver.find_element(By.TAG_NAME, "body")
    try:
        # NOTE(review): the leading "//" makes this XPath search the whole
        # document, not just `root` — confirm that is intended.
        dd = root.find_element(
            By.XPATH,
            "//dt[contains(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'age rating')"
            " or contains(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'classification')"
            " or contains(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'certificate')]/following-sibling::*[1]"
        )
        m = RATING_RE.search(dd.text or dd.get_attribute("textContent") or "")
        if m: return m.group(1).upper()
    except Exception:
        pass
    # proximity fallback (badge alt/aria/etc.)
    for el in root.find_elements(By.CSS_SELECTOR, "img[alt], [aria-label], [data-rating], [class*='rating' i]"):
        # Join every attribute that might carry the certificate text.
        bits = " ".join([
            el.get_attribute("alt") or "",
            el.get_attribute("aria-label") or "",
            el.get_attribute("data-rating") or "",
            el.get_attribute("title") or "",
            el.text or "",
        ])
        m = RATING_RE.search(bits)
        if m: return m.group(1).upper()
    return None

def follow_classification_link(driver):
    """Return the href of the first /classification/ link on the page, or None."""
    try:
        scope = driver.find_element(By.TAG_NAME, "main")
    except Exception:
        # No <main> landmark — search the whole body instead.
        scope = driver.find_element(By.TAG_NAME, "body")
    try:
        anchor = scope.find_element(By.CSS_SELECTOR, "a[href*='/classification/']")
    except Exception:
        return None
    return anchor.get_attribute("href")

# -------------------- Exact-title filtering helpers --------------------
def _norm_title(s: str) -> str:
    return re.sub(r"\s+", " ", re.sub(r"[^\w]+", " ", (s or "")).strip()).lower()

def _title_from_card_text(text: str) -> str:
    # e.g. "The Matrix (1999)\nContains ..." -> "The Matrix"
    if not text: return ""
    first_line = text.splitlines()[0]
    m = re.match(r"\s*([^(]+?)\s*(?:\(|$)", first_line)
    return (m.group(1).strip() if m else first_line.strip())

def _title_from_slug(url: str) -> str:
    # https://www.bbfc.co.uk/release/the-matrix-q29... -> "the matrix"
    slug = urlparse(url).path.split("/")[-1]
    slug = slug.split("-q29", 1)[0]
    return re.sub(r"[-_]+", " ", slug).strip()

# -------------------- BBFC search (with exact filter) --------------------
def _click_load_more_if_any(driver):
    """Click a "load more"-style button up to three times, stopping as soon
    as no such button can be found or clicked."""
    xpath = (
        "//button[contains(translate(.,'ABCDEFGHIJKLMNOPQRSTUVWXYZ','abcdefghijklmnopqrstuvwxyz'),'load more') or contains(., 'More results')]"
    )
    for _attempt in range(3):
        try:
            button = driver.find_element(By.XPATH, xpath)
            # Bring the button into view before clicking it.
            driver.execute_script("arguments[0].scrollIntoView({block:'center'});", button)
            button.click()
            time.sleep(0.8)
        except Exception:
            break

def _collect_search_links(driver, title, year_hint=None, limit=40, exact_title=True):
    """Run the BBFC site search for *title* and return candidate page links.

    Tries two search-URL parameter spellings ("q" and "term"), optionally
    keeps only exact-title matches, and returns up to *limit* deduplicated
    (href, card_text) pairs with classification pages sorted first.
    """
    want = _norm_title(title)
    links = []
    # Two URL spellings are tried in case the site expects one or the other.
    for url in (
        f"https://www.bbfc.co.uk/search?q={quote_plus(title)}",
        f"https://www.bbfc.co.uk/search?term={quote_plus(title)}",
    ):
        try:
            driver.get(url)
            accept_cookies(driver)
            WebDriverWait(driver, 18).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            time.sleep(0.7)
            _click_load_more_if_any(driver)
            # Scroll to the bottom twice to trigger any lazy-loaded results.
            for _ in range(2):
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);"); time.sleep(0.8)

            anchors = driver.find_elements(By.CSS_SELECTOR, "a[href*='/classification/'], a[href*='/release/']")
            for a in anchors:
                href = a.get_attribute("href") or ""
                text = (a.text or "")
                if not href: continue
                if not ("/classification/" in href or "/release/" in href): continue

                if exact_title:
                    # require exact match by card title OR URL slug
                    card_title = _title_from_card_text(text)
                    ok_text = _norm_title(card_title) == want
                    ok_slug = _norm_title(_title_from_slug(href)) == want
                    if not (ok_text or ok_slug):
                        continue

                links.append((href, text))
            # First search URL that yields anything wins; skip the second.
            if links:
                break
        except Exception:
            continue

    if not links:
        return []

    # prefer classification first; then higher score (title overlap/year hint)
    def _score(text):
        # Exact card-title match (+5) outweighs a year-hint match (+3).
        s = 0
        if _norm_title(_title_from_card_text(text)) == want: s += 5
        if year_hint and str(year_hint) in (text or ""): s += 3
        return s

    links.sort(key=lambda pair: ("/classification/" not in pair[0], -_score(pair[1]), pair[0]))
    # dedupe by href and cap
    seen, out = set(), []
    for href, txt in links:
        if href in seen: continue
        out.append((href, txt)); seen.add(href)
        if len(out) >= limit: break
    return out

# -------------------- Matching & main lookup --------------------
def _title_matches(requested_title, page_t, url, exact_title):
    """Decide whether a candidate page matches *requested_title*.

    With exact_title=True the normalised page title or URL slug must equal
    the request; otherwise a relaxed equality/prefix/word-subset match is
    used (falling back to the URL slug when no page title is available).
    """
    want = _norm_title(requested_title)
    if exact_title:
        # Page title OR slug must equal the requested title exactly.
        if page_t and _norm_title(page_t) == want:
            return True
        return _norm_title(_title_from_slug(url)) == want
    if page_t:
        have = _norm_title(page_t)
        if have == want or have.startswith(want):
            return True
        return set(want.split()).issubset(set(have.split()))
    return set(want.split()).issubset(set(_norm_title(_title_from_slug(url)).split()))

def find_best_rating(driver, title, specified_year=None, exact_title=True):
    """
    Search BBFC for *title*, open candidate pages, and pick the best rating.

    If specified_year is None: return rating for the MOST RECENT UK year.
    exact_title=True opens only exact-title matches (much faster for names like 'The Matrix').
    Returns: (rating_or_None, url_or_None, resolved_title_or_None, resolved_year_or_None)
    """
    candidates = _collect_search_links(driver, title, year_hint=specified_year, exact_title=exact_title)
    if not candidates:
        return None, None, None, None

    found = []  # (year or 0, prefer_classification, rating_or_None, url, resolved_title)

    for href, card_text in candidates:
        try:
            driver.get(href)
            WebDriverWait(driver, 18).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
            accept_cookies(driver)

            # Resolve title/year from the page itself, falling back to the slug.
            pt = page_title(driver) or _title_from_slug(href).title()
            py = page_year(driver)
            if not py:
                # fallback: get year from card text "(1999)"
                m = re.search(r"\b(19|20)\d{2}\b", card_text or "")
                py = int(m.group(0)) if m else None

            if not _title_matches(title, pt, href, exact_title):
                continue

            r = page_rating(driver)
            # A /release/ page may lack a rating; hop to its classification page.
            if (not r) and "/release/" in href:
                hop = follow_classification_link(driver)
                if hop:
                    driver.get(hop)
                    WebDriverWait(driver, 12).until(EC.presence_of_element_located((By.TAG_NAME, "body")))
                    pt = page_title(driver) or pt
                    py = page_year(driver) or py
                    r = page_rating(driver)
                    href = hop

            yr = int(py) if isinstance(py, int) else None
            prefer = "/classification/" in href
            found.append((yr or 0, prefer, r, href, pt))
        except Exception:
            # Any failure on one candidate just moves on to the next.
            continue

    if not found:
        return None, None, None, None

    # Exact-year request: prefer rated pages for that year, classification first.
    if specified_year is not None:
        exact = [x for x in found if x[0] == int(specified_year) and x[2]]
        if exact:
            exact.sort(key=lambda x: (not x[1], -x[0]))
            yr, pref, r, u, pt = exact[0]
            return r, u, pt, yr

    # Otherwise take the latest year among rated pages.
    rated = [x for x in found if x[2]]
    if rated:
        yr, pref, r, u, pt = max(rated, key=lambda x: (x[0], x[1]))  # latest year; prefer classification
        return r, u, pt, (yr or None)

    # No rating found anywhere: still report the best page we saw.
    yr, pref, r, u, pt = max(found, key=lambda x: (x[0], x[1]))
    return None, u, pt, (yr or None)

# -------------------- Convenience wrapper --------------------
def lookup_bbfc_rating(title, year=None, headless=True, exact_title=True):
    """One-shot lookup: start a browser, look up the rating, always quit.

    Returns the same 4-tuple as find_best_rating.
    """
    driver = start_driver(headless=headless)
    try:
        result = find_best_rating(driver, title, specified_year=year, exact_title=exact_title)
    finally:
        driver.quit()
    return result

# -------------------- Interactive CLI --------------------
if __name__ == "__main__":
    # headless=False so the browser window is visible while it runs.
    driver = start_driver(headless=False)
    try:
        while True:
            title = input("\nFilm title (blank to quit): ").strip()
            if not title:
                break
            year_text = input("UK classification year (optional, press Enter to skip): ").strip()
            year = int(year_text) if year_text.isdigit() else None

            rating, url, resolved_title, resolved_year = find_best_rating(
                driver, title, specified_year=year, exact_title=True
            )
            if rating:
                year_part = f" ({resolved_year})" if resolved_year else ""
                print(f"→ {resolved_title}{year_part} BBFC rating: {rating}\n   {url}")
            elif url:
                print(f"→ Could not find a rating, but found a page:\n   {resolved_title} ({resolved_year or 'year ?'})\n   {url}")
            else:
                print("→ No matching BBFC page found.")
    finally:
        driver.quit()

The code seems much longer than necessary and it is very slow.

Is there a shorter and faster solution that works reliably?

0

2 Answers 2

1

Using Selenium and regex parsing is overkill for this purpose. Often the website exposes the backend API it uses, so you can skip loading the frontend entirely. By looking at the Network tab in Chrome DevTools, I can see that it fetches data from a GraphQL API.

So just copy the query structure to fetch the data, avoiding all the hassle you had to go through. The code would look something like this:

import requests

# Endpoint found by watching the site's network requests in the browser dev tools.
endpoint = "https://www.bbfc.co.uk/graphql"

# Query structure copied from the request the site itself sends.
query = """
query Autocomplete($url: String!, $searchTerm: String!) {
  autocomplete(url: $url, searchTerm: $searchTerm) {
    results {
      title
      classification
      releaseDate
    }
  }
}
"""
search_term = input("Enter search term: ")
variables = {"url": "https://www.bbfc.co.uk/", "searchTerm": search_term}

# Query the GraphQL backend directly instead of scraping the frontend.
# A timeout stops the script hanging forever on a stalled connection.
r = requests.post(endpoint, json={"query": query, "variables": variables}, timeout=30)
r.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
data = r.json()

# GraphQL can return "data": null alongside an "errors" list, so guard
# against None at every level rather than chaining .get() blindly
# (None.get(...) would raise AttributeError).
autocomplete = (data.get("data") or {}).get("autocomplete") or {}
results = autocomplete.get("results") or []

# Pick the first or top result; .get avoids a KeyError if a field is absent.
if results:
    top_result = results[0]
    print("Top Result:")
    print(f"Title: {top_result.get('title')}")
    print(f"Classification: {top_result.get('classification')}")
    print(f"Release Date: {top_result.get('releaseDate')}")
else:
    print("No results found.")

This lets you get data directly in the format you want (no HTML parsing), and it should be much, much faster since you skip going through the frontend and waiting for elements to load (instead, you get the data directly from the source).

Sign up to request clarification or add additional context in comments.

Comments

1

Here's a much simpler way to do this. You can fancy it up however you like but this is the guts of a really simple and fast way to get the ratings.

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait

driver = webdriver.Chrome()
driver.maximize_window()
wait = WebDriverWait(driver, 10)

titles = ['Toy Story 4', 'How To Train Your Dragon', 'Predator: Badlands', 'Harry Potter And The Goblet Of Fire', 'The Matrix']
# try/finally guarantees the browser is closed even if a lookup times out —
# the original leaked the Chrome process by never calling driver.quit().
try:
    for title in titles:
        url = f'https://www.bbfc.co.uk/search?q={title}'
        driver.get(url)

        # Match the result whose <h2> starts with the exact title followed by
        # " (" — this skips partial matches such as "The Matrix Reloaded".
        # NOTE: a title containing a single quote would break this XPath literal.
        a = wait.until(EC.visibility_of_element_located((By.XPATH, f"//a[.//h2[starts-with(text(), '{title} (')]]")))
        rating = a.find_element(By.CSS_SELECTOR, "span").get_attribute("aria-label").split("-")[1].upper()
        actual_title = a.find_element(By.CSS_SELECTOR, "h2").text
        print(f"{actual_title}: {rating}")
finally:
    driver.quit()

and it outputs

Toy Story 4 (2019): U
How To Train Your Dragon (2025): PG
Predator: Badlands (2024): 12A
Harry Potter And The Goblet Of Fire (2005): 12
The Matrix (1999): 15

2 Comments

The difficulty is what to do with "The Matrix", for example. The BBFC search returns all the films that have that within their titles. My (terrible) code finds the ones with exactly the right name and selects the most recent one.
Fair enough. I've updated it to find the exact movie. All it took was a simple XPath. I added The Matrix to the list of movies as well to prove that it works.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.