I have written the code below to do the following:

  1. Regularly scrape all live matches on the betting site oddsportal.com
  2. Pull the odds data into a data frame
  3. Evaluate the data frame for two odds providers (Asianodds, Pinnacle) and compare the actual data against pre-defined patterns
  4. Send a Telegram message if a pattern is identified (a safer sending helper is sketched after this list)
  5. Save the scraped links in a JSON file so that they are not scraped again (a set-based variant is sketched after this list)
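
For step 4, the script currently builds the sendMessage URL by plain string concatenation, which breaks once the message contains characters that need URL encoding (the alert text below contains quotes and emoji). A minimal sketch of a helper (the name send_telegram is mine) that lets requests do the encoding instead:

'''
import requests

def send_telegram(bot_token, chat_id, text):
    # Send a message via the Telegram Bot API; requests URL-encodes `text`,
    # so emoji, quotes and spaces are safe.
    url = f'https://api.telegram.org/bot{bot_token}/sendMessage'
    resp = requests.get(url, params={'chat_id': chat_id, 'text': text}, timeout=10)
    resp.raise_for_status()  # surface HTTP errors instead of silently dropping them
'''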
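
For step 5, loading the scraped links into a set once per run makes the membership test O(1) and avoids re-reading and re-writing the JSON file for every match. A minimal sketch, assuming the same Config/scraped.json layout (note that writing once at the end trades away the per-match crash safety of the current approach):

'''
import json

def load_scraped(path='Config/scraped.json'):
    # Load the already-scraped links once at startup
    with open(path) as f:
        return set(json.load(f)['scraped'])

def save_scraped(scraped, path='Config/scraped.json'):
    # Write the full set back once, e.g. at the end of a run
    with open(path, 'w') as f:
        json.dump({'scraped': sorted(scraped)}, f, indent=4)
'''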

My code still has the following issues that I hope this review can help with:

  • Performance: it currently takes 1-2 minutes per match to scrape and analyse. How can this be done faster/more efficiently? (One idea is sketched after the code below.)
  • When many matches are running simultaneously, the script cannot scrape them all before the cronjob starts the next run, and the two instances clash. How can I make the script check whether an instance is already running and wait for it to finish? (A possible approach is sketched after this list.)
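
On the second point, this is usually solved outside Selenium: take an exclusive lock on a file at startup and either wait for the previous run or exit. A minimal sketch using fcntl (POSIX-only, which matches the macOS shebang below); the lock path is a made-up example:

'''
import fcntl
import sys

LOCK_PATH = '/tmp/oddsportal.lock'  # hypothetical path; any writable location works

def acquire_lock(block=True):
    # Returns a locked file handle; the OS releases the lock when the process exits.
    lock_file = open(LOCK_PATH, 'w')
    flags = fcntl.LOCK_EX if block else fcntl.LOCK_EX | fcntl.LOCK_NB
    try:
        fcntl.flock(lock_file, flags)
    except BlockingIOError:  # only raised in non-blocking mode
        return None
    return lock_file

lock = acquire_lock(block=True)  # block=True waits until the previous instance finishes
if lock is None:
    sys.exit('Another instance is already running.')
'''

On Linux, the cron entry itself can be wrapped instead: flock -n /tmp/oddsportal.lock /path/to/script.py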

'''

#!/Library/Frameworks/Python.framework/Versions/3.8/bin/python3

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from multiprocessing import Process
#from DbManager import DatabaseManager
import json
import re
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from webdriver_manager.chrome import ChromeDriverManager
import datetime
import time
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.support import expected_conditions as EC
import requests
import cProfile


o_u_types = [1.50, 1.75, 2.00, 2.25, 2.50, 2.75, 3.00, 3.25, 3.50]
raw_time = str(datetime.datetime.now())
current_date = raw_time[0:10]
bookmakers = ['Asianodds', 'Pinnacle']
countries = ['England', 'Japan', 'France', 'Germany', 'India', 'Chile', 'Italy', 'Turkey', 'Czech Republic', 'Spain', 'Colombia', 'Poland', 'Belgium', 'Romania', 'Paraguay',
              'Portugal', 'Netherlands', 'Cyprus', 'Mexico', 'Brazil', 'Uruguay', 'Serbia', 'Slovenia', 'Slovakia', 'Sweden', 'Norway', 'USA', 'Estonia']
limited_league_countries = ['England', 'Germany', 'Italy', 'Spain']
leagues = ['National League', 'Championship', 'League One', '2. Bundesliga', '3. Liga', 'Regionalliga West', 'Regionalliga Sudwest', 'Serie A', 'Serie B', 'LaLiga', 'LaLiga2']



base_url='https://api.telegram.org/bot'
bot_token='xxxx'
chat_id='-xxxx'

TYPE_ODDS = 'OPENING'  # 'OPENING' collects opening odds; any other value collects closing odds


link='https://www.oddsportal.com/inplay-odds/live-now/soccer/'

class Oddsportal:
    def ReadScrapedLinks(self):
        with open("Config/scraped.json") as file:
            data = json.load(file)
        return data["scraped"]
    
    def SaveScrapedMatch(self, link):
        with open("Config/scraped.json") as oldfile:
            data = json.load(oldfile)
        data["scraped"].append(link)
        with open("Config/scraped.json", "w") as newfile:
            json.dump(data, newfile, indent=4)

    def filter_list(self, links):
        # Keep only links that have not been scraped yet (unused; matchcollector filters inline)
        scraped_links = self.ReadScrapedLinks()
        self.filtered_links = []
        for link in links:
            if link not in scraped_links:
                self.filtered_links.append(link)
    
    def FindByCSSAndAttribute(self, mobject, css, attribute):
        # Return the attribute of the first matching child element, or False if absent
        try:
            return mobject.find_element_by_css_selector(css).get_attribute(attribute)
        except NoSuchElementException:
            return False

    def WaitForObjects(self,type, string):
        return WebDriverWait(self.driver, 5).until(EC.presence_of_all_elements_located((type, string)))

    def fi(self, a):
        # Return the text of the element at XPath `a`, or False if it is absent
        try:
            return self.driver.find_element_by_xpath(a).text
        except NoSuchElementException:
            return False

    def ffi(self, a):
        # Like fi(), but returns None instead of False when the element is absent
        text = self.fi(a)
        return text if text is not False else None

    def fffi(self, a):
        # get_opening_odd() is not defined in this script, so fall back to the live value
        if TYPE_ODDS == 'OPENING':
            try:
                return get_opening_odd(a)
            except NameError:
                return self.ffi(a)
        return self.ffi(a)

    def fi2(self, a):
        # Click the element at XPath `a`; return False if the click fails
        try:
            self.driver.find_element_by_xpath(a).click()
        except Exception:
            return False

    def ffi2(self, a):
        # Return True if the click succeeded, None otherwise
        return True if self.fi2(a) is not False else None

    def __init__(self):
        mobile_emulation = {
            "deviceMetrics": {"width": 360, "height": 640, "pixelRatio": 3.0},
            "userAgent": "Mozilla/5.0 (Linux; Android 4.2.1; en-us; Nexus 5 Build/JOP40D) AppleWebKit/535.19 (KHTML, like Gecko) Chrome/18.0.1025.166 Mobile Safari/535.19"}
        chrome_options = Options()
        chrome_options.add_experimental_option(
            "mobileEmulation", mobile_emulation)
        # chrome_options.add_experimental_option("debuggerAddress", "127.0.0.1:9222")
        # Initialize the Chrome driver with the mobile-emulation options
        self.driver = webdriver.Chrome(ChromeDriverManager().install(), options=chrome_options)

        


    def matchcollector(self, link):
        self.driver.get(link)
        live_matches = WebDriverWait(self.driver, 10).until(
            EC.presence_of_all_elements_located((By.CSS_SELECTOR, '.minutes-anim')))

        print(f'There are currently {len(live_matches)} matches live.')

        # Collect all match links and strip the in-play addendum (last path segment)
        all_matches = self.WaitForObjects(By.CLASS_NAME, "name.table-participant")
        self.all_links = []
        for match_link in all_matches:
            link = self.FindByCSSAndAttribute(match_link, 'a', 'href')
            try:
                in_play_addendum = link.split('/')[-2]
                self.all_links.append(link.replace(in_play_addendum, ""))
            except AttributeError:  # FindByCSSAndAttribute returned False
                continue

        # Remove already scraped links
        scraped_links = self.ReadScrapedLinks()
        self.filtered_links = [l for l in self.all_links if l not in scraped_links]
        print(f'Of all matches, {len(self.filtered_links)} have not yet been checked.')
   


    def openmatch(self, match):
        # Open a match page; returns False if navigation fails
        try:
            self.driver.get(match)
            time.sleep(1)
            self.driver.maximize_window()
        except Exception:
            return False
               


    def getodds(self): 

        master_df= pd.DataFrame()
        for link in self.filtered_links:
            self.openmatch(link)
            country= self.ffi('//*[@id="breadcrumb"]/a[3]')

            if country in countries:
                league= self.ffi('//*[@id="breadcrumb"]/a[4]')
                
                # In limited-league countries, only the whitelisted leagues are checked
                if country in limited_league_countries and league not in leagues:
                    continue
                match = self.ffi('//*[@id="col-content"]/h1') 
                game_message_string = f'Checking {match}'
                game_method = '/sendMessage?chat_id={}&text="{}"'.format(chat_id, game_message_string)

                game_telegram_url= base_url + bot_token + game_method
                requests.get(game_telegram_url)  
                final_score = self.ffi('//*[@id="event-status"]')
                date = self.ffi('//*[@id="col-content"]/p[1]') # Date and time
                game_df= pd.DataFrame()

                for i in o_u_types:
                    url_appendix = "#over-under;2;{};0".format(i)
                    o_u_match_url = str(link) + str(url_appendix)
                    print(o_u_match_url)
                    self.openmatch(o_u_match_url)  # navigate to this over/under line's tab

                    for x in range(1, 28):
                        rows = []

                        for j in range(1, 15):  # iterate over the displayed bookmaker rows
                            Book = self.ffi('//*[@id="odds-data-table"]/div[{}]/table/tbody/tr[{}]/td[1]/div/a[2]'.format(x, j))  # bookmaker name
                            if Book in bookmakers:
                                Over = self.fffi('//*[@id="odds-data-table"]/div[{}]/table/tbody/tr[{}]/td[3]/div'.format(x, j))   # over odd
                                Under = self.fffi('//*[@id="odds-data-table"]/div[{}]/table/tbody/tr[{}]/td[4]/div'.format(x, j))  # under odd
                                if Book == 'Pinnacle':
                                    # In live play Pinnacle's odds sit in an <a> rather than a <div>
                                    if Over is None:
                                        Over = self.fffi('//*[@id="odds-data-table"]/div[{}]/table/tbody/tr[{}]/td[3]/a'.format(x, j))
                                    if Under is None:
                                        Under = self.fffi('//*[@id="odds-data-table"]/div[{}]/table/tbody/tr[{}]/td[4]/a'.format(x, j))
                                print(match, country, league, Book, Over, Under, date, final_score, link)
                                rows.append((match, country, league, Book, Over, Under, date, final_score, link))
                                data_df = pd.DataFrame(rows)

                                try:
                                    data_df.columns = ['TeamsRaw', 'Country', 'League', 'Bookmaker', 'Over', 'Under', 'DateRaw', 'ScoreRaw', 'Link']
                                except ValueError:
                                    print('Column assignment failed, probable reason: no games scraped (empty season)')


                                # Clean the data: split team names, transform the date
                                data_df["Home_id"] = [re.split(' - ', y)[0] for y in data_df["TeamsRaw"]]
                                data_df["Away_id"] = [re.split(' - ', y)[1] for y in data_df["TeamsRaw"]]
                                data_df["Date"] = [re.split(', ', y)[1] for y in data_df["DateRaw"]]

                                data_df[f"Over_{i}"] = Over
                                data_df[f"Under_{i}"] = Under

                                master_df = pd.concat([master_df, data_df])
                                game_df = pd.concat([game_df, data_df])
            else:
                print('Match not in a relevant country. Blacklisting it.')
                self.SaveScrapedMatch(link)
                continue

            try:
                # Deduplicate and keep one row per team/bookmaker pair, then compare the lines
                game_df.drop_duplicates(keep='first', inplace=True)
                game_df = game_df.groupby(['TeamsRaw', 'Bookmaker'], as_index=False).first()

                if len(game_df.index) == 2:

                    for i in o_u_types:
                        try:
                            # The scraped odds are text, so compare them as floats
                            asian_over = float(game_df.at[0, f"Over_{i}"])
                            asian_under = float(game_df.at[0, f"Under_{i}"])
                            pin_over = float(game_df.at[1, f"Over_{i}"])
                            pin_under = float(game_df.at[1, f"Under_{i}"])

                            if asian_over > pin_over:
                                game_df[f"overdominant_{i}"] = "AsianDominant"
                            elif asian_over < pin_over:
                                game_df[f"overdominant_{i}"] = "PinDominant"
                            else:
                                game_df[f"overdominant_{i}"] = "Parity"

                            if asian_under > pin_under:
                                game_df[f"underdominant_{i}"] = "AsianDominant"
                            elif asian_under < pin_under:
                                game_df[f"underdominant_{i}"] = "PinDominant"
                            else:
                                game_df[f"underdominant_{i}"] = "Parity"

                        except (KeyError, TypeError, ValueError):  # odds missing for this line
                            game_df[f"overdominant_{i}"] = "n/a"
                            game_df[f"underdominant_{i}"] = "n/a"

                    check_row= game_df.drop([1])
                    check_row_match= check_row.TeamsRaw.values
                    check_row_country= check_row.Country.values
                    check_row_league= check_row.League.values
                    print(check_row)


                    # King M5: Asian dominant on the 2.25 and 2.5 unders, parity (or n/a) on every other line
                    def neutral(col):
                        return check_row[col].isin(['Parity', 'n/a'])

                    mask = ((check_row['underdominant_2.25'] == 'AsianDominant')
                            & (check_row['underdominant_2.5'] == 'AsianDominant'))
                    for ou_line in o_u_types:
                        mask &= neutral(f'overdominant_{ou_line}')
                        if ou_line not in (2.25, 2.5):
                            mask &= neutral(f'underdominant_{ou_line}')
                    king_m5 = check_row[mask]
                    if not king_m5.empty:
                        print(f'King M5 pattern found in {check_row_match}')
                        
                        message_string=f' M5-U1,5 ⚠️👑 👑 in {check_row_country}, {check_row_league}, {check_row_match}'
                        method= '/sendMessage?chat_id={}&text="{}"'.format(chat_id,message_string)

                        telegram_url= base_url + bot_token + method
                        print(telegram_url)
                        requests.get(telegram_url)                

                    else:
                        print (f'Match {check_row_match} does not contain a king M5 pattern.')


                else:
                    print('Match does not contain both bookmakers. Blacklisting it.')
                    self.SaveScrapedMatch(link)
                    continue

                self.SaveScrapedMatch(link)

            except Exception:
                self.SaveScrapedMatch(link)
                continue



def main():
    op = Oddsportal()
    op.matchcollector(link)  # collects live-match links and filters already-scraped ones
    op.getodds()

if __name__== "__main__":
    #p1 = Process(target=main)
    #p1.start()
    cProfile.run('main()', filename='report.txt', sort=-1)

'''
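
On the performance question: most of the time per match is spent on the hundreds of individual find_element_by_xpath calls in getodds, each of which is a round-trip to the driver. BeautifulSoup is already imported at the top but never used; one option is to fetch driver.page_source once per odds page and parse the rows locally. A minimal sketch, with CSS selectors inferred from the XPaths above (an assumption about the page structure, so it would need verifying):

'''
from bs4 import BeautifulSoup

def parse_odds_rows(page_source, wanted_bookmakers):
    # Parse all bookmaker rows of one odds page in memory: a single driver call
    # (driver.page_source) instead of hundreds of find_element round-trips.
    soup = BeautifulSoup(page_source, 'html.parser')
    table = soup.select_one('#odds-data-table')
    rows = []
    if table is None:
        return rows
    for tr in table.select('tbody tr'):
        cells = tr.find_all('td')
        if len(cells) < 4:
            continue
        name = cells[0].select_one('div a:nth-of-type(2)')  # mirrors td[1]/div/a[2] above
        if name is None or name.get_text(strip=True) not in wanted_bookmakers:
            continue
        rows.append((name.get_text(strip=True),
                     cells[2].get_text(strip=True),    # over (td[3])
                     cells[3].get_text(strip=True)))   # under (td[4])
    return rows

# Hypothetical usage inside getodds(), replacing the nested x/j loops:
#     self.openmatch(o_u_match_url)
#     for Book, Over, Under in parse_odds_rows(self.driver.page_source, bookmakers):
#         ...
'''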
