Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
__pycache__/
.vscode/
37 changes: 37 additions & 0 deletions joble/scraper/google.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
import argparse
import requests
from bs4 import BeautifulSoup

# strip Google's tracking query-string ('&sa=...&ved=...') from the final
# path segment of a result URL, if one was appended
def fix_url(url_list):
    """Clean the last segment of a split URL in place and return the list.

    Google appends tracking parameters ('&sa=...', '&ved=...') to result
    links; when the final path segment looks long enough to carry them
    (more than 8 characters), everything from the first '&' onward is
    dropped. The list is mutated and also returned for convenience.
    """
    tail = url_list[-1]
    if len(tail) > 8:
        # partition('&')[0] keeps only the text before the first '&'
        url_list[-1] = tail.partition('&')[0]
    return url_list

# return the link for the careers page from a Google search
def get_carrer_page(name):
    """Return the careers-page URL for company *name*, or None when not found.

    Fetches the Google results page for "<name> careers" and returns the
    first result link whose domain contains the company name (with any
    '.tld' suffix stripped from *name* before matching).

    Fixes vs. the original:
    - the search query spells "careers" correctly (was "carrers"),
    - the HTTP request has a timeout so it cannot hang indefinitely,
    - the scan no longer stops at the first '/url?q=' link when its domain
      does not match; all result links are considered,
    - short hrefs that split into fewer than three segments are skipped
      instead of raising IndexError.
    """
    query = name.replace(' ', '+')
    URL = f"https://google.com/search?q={query}+careers"

    resp = requests.get(URL, timeout=10)

    # 'foo.com' -> 'foo' so the name can be matched against the domain part
    if '.' in name:
        name = name.split('.')[0]

    if resp.status_code != 200:
        return None

    soup = BeautifulSoup(resp.content, "html.parser")
    for a in soup.find_all('a', href=True):
        href = a['href']
        if 'url' not in href:
            continue
        # Google wraps results as '/url?q=<target>&...'; drop the 7-char
        # '/url?q=' prefix, then split into path segments
        url_list = href[7:].split('/')
        # url_list[2] is the domain segment of 'https://domain/...'
        if len(url_list) > 2 and name in url_list[2]:
            return '/'.join(fix_url(url_list))
    return None

if __name__ == '__main__':
    # CLI entry point: takes the company name and prints its careers URL
    cli = argparse.ArgumentParser()
    cli.add_argument("name", help="name of the company", type=str)
    parsed = cli.parse_args()

    print(get_carrer_page(parsed.name))
2 changes: 2 additions & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
attrs==19.3.0
Automat==20.2.0
beautifulsoup4==4.9.2
cffi==1.14.0
constantly==15.1.0
cryptography==2.9.2
Expand All @@ -19,6 +20,7 @@ PyDispatcher==2.0.5
PyHamcrest==2.0.2
pyOpenSSL==19.1.0
queuelib==1.5.0
requests==2.24.0
Scrapy==2.2.0
service-identity==18.1.0
six==1.15.0
Expand Down