diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..3040d64
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+__pycache__/
+.vscode/
\ No newline at end of file
diff --git a/joble/scraper/google.py b/joble/scraper/google.py
new file mode 100644
index 0000000..72f167f
--- /dev/null
+++ b/joble/scraper/google.py
@@ -0,0 +1,70 @@
+"""Find the careers page of a company through a Google search."""
+import argparse
+from urllib.parse import quote_plus
+
+import requests
+from bs4 import BeautifulSoup
+
+# prefix stripped from the start of a result link to get the target url
+RESULT_PREFIX = '/url?q='
+# seconds to wait for a response, so a stalled connection cannot hang us
+REQUEST_TIMEOUT = 10
+
+
+# function to remove random characters from the end of the url if exists
+def fix_url(url_list):
+    """Cut the '&...' suffix off the last segment of a url split on '/'.
+
+    The last item is truncated in place at its first '&', but only when it
+    is longer than 8 characters; an empty list is left untouched. Returns
+    the same list object.
+    """
+    # NOTE(review): the 8 character threshold is inherited unchanged;
+    # presumably it targets the parameters Google appends - confirm.
+    if url_list and len(url_list[-1]) > 8:
+        url_list[-1] = url_list[-1].split('&')[0]
+    return url_list
+
+
+# return the link for careers page from google
+def get_carrer_page(name):
+    """Return the first search result whose host contains the company name.
+
+    name is the company name or domain, e.g. "example" or "example.com".
+    Returns the url as a string, or None when name is empty, the status
+    code is not 200, or no result matches. Errors raised by requests.get
+    (including the timeout) are not caught here.
+    """
+    # "example.com" -> "example"; lowered because host names ignore case
+    company = name.split('.')[0].strip().lower()
+    if not company:
+        return None
+
+    # quote_plus escapes characters like '&' or '#' that would otherwise
+    # break the query string, and turns spaces into '+'
+    query = quote_plus(f"{name} careers")
+    search_url = f"https://google.com/search?q={query}"
+
+    resp = requests.get(search_url, timeout=REQUEST_TIMEOUT)
+    if resp.status_code != 200:
+        return None
+
+    soup = BeautifulSoup(resp.content, "html.parser")
+    for a in soup.find_all('a', href=True):
+        href = a['href']
+        if not href.startswith(RESULT_PREFIX):
+            continue
+        # ['https:', '', '<host>', ...]; anything shorter has no host part
+        url_list = href[len(RESULT_PREFIX):].split('/')
+        if len(url_list) > 2 and company in url_list[2].lower():
+            return '/'.join(fix_url(url_list))
+    return None
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser()
+    parser.add_argument("name", help="name of the company", type=str)
+    args = parser.parse_args()
+
+    url = get_carrer_page(args.name)
+    print(url)
diff --git a/requirements.txt b/requirements.txt
index 3f9f5d0..7758815 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 attrs==19.3.0
 Automat==20.2.0
+beautifulsoup4==4.9.2
 cffi==1.14.0
 constantly==15.1.0
 cryptography==2.9.2
@@ -19,6 +20,7 @@ PyDispatcher==2.0.5
 PyHamcrest==2.0.2
 pyOpenSSL==19.1.0
 queuelib==1.5.0
+requests==2.24.0
 Scrapy==2.2.0
 service-identity==18.1.0
 six==1.15.0