1

I'm trying to host my scraper on the cloud using a docker file since I can't install a desktop env on my vps. I can't find any examples for seleniumbase and it must run in headful mode for the cloudflare captcha bypass to work.

I got it to work, but the script randomly freezes, even the shutdown signal handler doesn't work when I press ctrl+c and I noticed that it freezes on anything that has something to do with an element being located on the page.

My dockerfile:

# Use the official Python image
FROM python:3.10-slim

# Set environment variables
ENV PYTHONUNBUFFERED=1
ENV DISPLAY=:99

# Install system dependencies for Chrome and Xvfb
RUN apt-get update && apt-get install -y \
    wget \
    gnupg2 \
    curl \
    unzip \
    libxi6 \
    libgconf-2-4 \
    libnss3 \
    libxss1 \
    fonts-liberation \
    libappindicator3-1 \
    x11-utils \
    xvfb \
    python3-tk \
    python3-dev \
    && apt-get clean \
    && rm -rf /var/lib/apt/lists/*

# Install Google Chrome Stable and fonts
RUN apt-get update && apt-get install curl gnupg -y \
  && curl --location --silent https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - \
  && sh -c 'echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list' \
  && apt-get update \
  && apt-get install google-chrome-stable -y --no-install-recommends \
  && rm -rf /var/lib/apt/lists/*

# Set the working directory
WORKDIR /app
RUN mkdir /app/misc

# Copy the requirements file and install Python dependencies
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of your application code
COPY . .

# Start Xvfb and then run the script
CMD ["sh", "-c", "Xvfb :99 -screen 0 1920x1080x24 & python private/main.py"]

The function for scraping (I removed the URL for privacy reasons):

async def fetch_token_hrefs():
    """
    A generator to fetch and yield the first three tokens from front page.
    This function will loop indefinitely to allow refreshing the page for new tokens,
    and restart the driver every 10 minutes.
    """
    driver = None
    last_restart_time = time.time()
    try:
        while True:  # Loop to refresh the page for new tokens
            # Restart the driver every 10 minutes
            current_time = time.time()
            if driver is None or (current_time - last_restart_time) > 300:
                if driver:
                    driver.quit()
                driver = Driver(
                    binary_location="/usr/bin/google-chrome",
                    undetected=True, browser='chrome', no_sandbox=True,
                    agent=random.choice(user_agents),
                    do_not_track=True, headless=False,
                    #disable_gpu=True
                )
                driver.set_window_size(600, 600)
                last_restart_time = current_time

            start_time = time.time()
            try:
                # Refresh the page and click the captcha
                driver.uc_open_with_reconnect()
                # Check for Cloudflare captcha and click it if detected
                if "Cloudflare" in driver.page_source:
                    logging.info("Cloudflare captcha detected. Attempting to click the captcha.")
                    driver.uc_gui_click_captcha()
                logging.info("Passed the cloudflare captcha")
                # Fetch the first three tokens from the page
                tokens = []
                WebDriverWait(driver, 5).until(
                    EC.presence_of_all_elements_located((By.CSS_SELECTOR, 'a.custom-p8lifi'))
                )
                for i in range(1, 4):
                    try:
                        token_link = driver.find_element(By.CSS_SELECTOR, f'a.custom-p8lifi:nth-child({i})')
                        if token_link is None:
                            raise ValueError(f"Token link for token {i} is None")

                        token_name, amount = parse_token_info(token_link.text)
                        if token_name is None or amount is None:
                            raise ValueError(f"Token name or amount for token {i} is None")

                        href = token_link.get_attribute('href')
                        if href is None:
                            raise ValueError(f"Href for token {i} is None")

                        tokens.append({
                            'name': sanitize_token_name(token_name),
                            'amount': amount,
                            'url': href,
                        })
                        logging.info("Fetched token %s: %s", i, token_name)

                    except Exception as e:
                        logging.warning("Could not fetch token %s: %s", i, str(e))
                        #await bot.send_message(CHANNEL_ID, f"🚨Token Fetching Error at {current_time}: \n{str(e)}")
                
                end_time = time.time()
                elapsed_time = end_time - start_time
                timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                logging.info("Elapsed time: %s seconds", elapsed_time)

                # Save the elapsed time to a file for later visualization
                with open("token_fetch_times.txt", "a", encoding="utf-8") as f:
                    f.write(f"{timestamp},{elapsed_time}\n")

                yield tokens, elapsed_time

                # Refresh the page instead of quitting the driver
                driver.refresh()
                
                # Wait a bit before the next fetch to control the frequency of fetching
                await asyncio.sleep(random.randint(1, 2))
                
            except Exception as e:
                logging.error("Error fetching tokens: %s", e)
                driver.refresh()
                #await bot.send_message(CHANNEL_ID, f"🚨Token Fetching Error at {current_time}: \n{str(e)}")

    except Exception as e:
        logging.error("Failed to initialize driver: %s", str(e))
        await bot.send_message(CHANNEL_ID, f"🚨 Driver Initialization Error: {str(e)}")
        if driver:
            driver.quit()

0

Your Answer

By clicking “Post Your Answer”, you agree to our terms of service and acknowledge you have read our privacy policy.

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.