Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
__pycache__/
.vscode/
37 changes: 37 additions & 0 deletions joble/scraper/google.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import argparse
import requests
from bs4 import BeautifulSoup

# strip Google's tracking query-string ('&sa=...&ved=...') from the final
# path segment of a result URL, if one was appended
def fix_url(url_list):
    """Clean the last segment of a split URL in place and return the list.

    Google appends tracking parameters ('&sa=...', '&ved=...') to result
    links; when the final path segment looks long enough to carry them
    (more than 8 characters), everything from the first '&' onward is
    dropped. The list is mutated and also returned for convenience.
    """
    tail = url_list[-1]
    if len(tail) > 8:
        # partition('&')[0] keeps only the text before the first '&'
        url_list[-1] = tail.partition('&')[0]
    return url_list

# return the link for the careers page from a Google search
def get_carrer_page(name):
    """Return the careers-page URL for company *name*, or None when not found.

    Fetches the Google results page for "<name> careers" and returns the
    first result link whose domain contains the company name (with any
    '.tld' suffix stripped from *name* before matching).

    Fixes vs. the original:
    - the search query spells "careers" correctly (was "carrers"),
    - the HTTP request has a timeout so it cannot hang indefinitely,
    - the scan no longer stops at the first '/url?q=' link when its domain
      does not match; all result links are considered,
    - short hrefs that split into fewer than three segments are skipped
      instead of raising IndexError.
    """
    query = name.replace(' ', '+')
    URL = f"https://google.com/search?q={query}+careers"

    resp = requests.get(URL, timeout=10)

    # 'foo.com' -> 'foo' so the name can be matched against the domain part
    if '.' in name:
        name = name.split('.')[0]

    if resp.status_code != 200:
        return None

    soup = BeautifulSoup(resp.content, "html.parser")
    for a in soup.find_all('a', href=True):
        href = a['href']
        if 'url' not in href:
            continue
        # Google wraps results as '/url?q=<target>&...'; drop the 7-char
        # '/url?q=' prefix, then split into path segments
        url_list = href[7:].split('/')
        # url_list[2] is the domain segment of 'https://domain/...'
        if len(url_list) > 2 and name in url_list[2]:
            return '/'.join(fix_url(url_list))
    return None

if __name__ == '__main__':
    # CLI entry point: takes the company name and prints its careers URL
    cli = argparse.ArgumentParser()
    cli.add_argument("name", help="name of the company", type=str)
    parsed = cli.parse_args()

    print(get_carrer_page(parsed.name))
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
attrs==19.3.0
Automat==20.2.0
beautifulsoup4==4.9.2
cffi==1.14.0
constantly==15.1.0
cryptography==2.9.2
Expand All @@ -19,6 +20,7 @@ PyDispatcher==2.0.5
PyHamcrest==2.0.2
pyOpenSSL==19.1.0
queuelib==1.5.0
requests==2.24.0
Scrapy==2.2.0
service-identity==18.1.0
six==1.15.0
Expand Down