[Early stage] Add Selenium WebDriver mechanism to bypass Cloudflare

2020-05-08 06:48:26 +02:00 · 2020-05-08 06:48:26 +02:00 · 2968a2d312
parent 79e3106c08
commit 2968a2d312
4 changed files with 66 additions and 17 deletions
--- a/.gitignore
+++ b/.gitignore
@ -3,3 +3,4 @@ __pycache__/
 yggcrawl/__pycache__/
 yggcrawl/__init__.pyc
 login.py
 yggcrawl/gecko/geckodriver.log
--- a/crawl.py
+++ b/crawl.py
@ -104,11 +104,10 @@ def downloadTorrent():
        # Download torrent file
        if(scraper.login(login.user, login.passwd)):
            print(colored("Login success", 'green'))
-            subprocess.Popen('[[ $(ls data/tmp/torrents/) ]] && rm data/tmp/torrents/*', executable='/bin/bash')
+            subprocess.Popen('[[ $(ls data/tmp/torrents/) ]] && rm data/tmp/torrents/*', executable='/bin/bash', stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            scraper.download_from_torrent_url(research)
-#            os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent}.torrent && mv *.torrent ../../torrents/')
+#            os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent.strip()}.torrent && mv {idTorrent.strip()}.torrent ../../torrents/').read()
-            os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent.strip()}.torrent && mv {idTorrent.strip()}.torrent ../../torrents/')
+            os.popen('cd data/tmp/torrents/ && mv *.torrent ../../torrents/')
        else:
            print(colored("Login failed", 'red'))
            sys.exit(1)
--- a/yggcrawl/gecko/torrent_search.py
+++ b/yggcrawl/gecko/torrent_search.py
@ -0,0 +1,44 @@
 #!/usr/bin/python3
 # Early example of how to use Selenium with geckodriver (headless Firefox) to get
 # past Cloudflare's bot detection on the YggTorrent search page.
 # Usage: torrent_search.py <film or series name>
 # Output: the matching torrent URLs, one per line, on stdout.
 # The only way to block this would presumably be a captcha in front of every
 # YggTorrent page, per session...
 import sys
 import time
 from urllib.parse import quote_plus
 from selenium import webdriver
 from selenium.webdriver.common.by import By
 from selenium.webdriver.support.ui import WebDriverWait
 from selenium.webdriver.support import expected_conditions as EC
 # Exit if no search term was given on the command line
 if len(sys.argv) == 1:
     sys.exit("Please choose a film ou serie name")
 arg1 = sys.argv[1]
 # URL-encode the term: a raw '&', '#' or space would otherwise change the query
 # string, and an unencoded URL may differ from the browser-normalised
 # current_url, which would end the redirect wait below immediately.
 search_url = f"https://www2.yggtorrent.se/engine/search?name={quote_plus(arg1)}&description=&file=&uploader=&category=all&sub_category=&do=search&order=desc&sort=seed"
 # Load webdriver with Gecko
 options = webdriver.FirefoxOptions()
 options.add_argument('-headless')
 driver = webdriver.Firefox(options=options, executable_path=r'/usr/local/bin/geckodriver')
 # Everything after the browser starts runs under try/finally so the headless
 # Firefox is always shut down, including when the wait below times out.
 try:
     driver.get(search_url)
     # Wait (up to 10 seconds) for the URL to change, i.e. for the anti-bot redirect
     print("Page atteinte, attente de redirection anti-crawling...")
     wait = WebDriverWait(driver, 10)
     wait.until(lambda driver: driver.current_url != search_url)
     # Wait 2 seconds to let the results page load
     print("Anti-crawling passé, affichage dans 2 secondes ...")
     time.sleep(2)
     # Collect every link inside the results area, then keep only torrent pages
     elems = driver.find_elements(By.CSS_SELECTOR, ".results [href]")
     hrefs = (elem.get_attribute('href') for elem in elems)
     links = [href for href in hrefs if href and '/torrent/' in href]
     # Print torrent URLs, one per line
     print("\n".join(links))
 finally:
     driver.quit()
--- a/yggcrawl/yggtorrentscraper.py
+++ b/yggcrawl/yggtorrentscraper.py
@ -44,7 +44,7 @@ YGGTORRENT_SEARCH_URL_DO = "&do="
 YGGTORRENT_SEARCH_URL_PAGE = "&page="
 YGGTORRENT_GET_FILES = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
-YGGTORRENT_GET_INFO = f"https://www2.yggtorrentchg/engine/get_nfo?torrent="
+YGGTORRENT_GET_INFO = f"https://www2.yggtorrent.se/engine/get_nfo?torrent="
 YGGTORRENT_MOST_COMPLETED_URL = f"{YGGTORRENT_BASE_URL}/engine/mostcompleted"
@ -52,6 +52,7 @@ TORRENT_PER_PAGE = 50
 YGGTORRENT_FILES_URL = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
 headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
 def set_yggtorrent_tld(yggtorrent_tld=None):
    """
@ -77,7 +78,7 @@ def set_yggtorrent_tld(yggtorrent_tld=None):
    YGGTORRENT_SEARCH_URL = f"{YGGTORRENT_BASE_URL}/engine/search?name="
-    YGGTORRENT_DOMAIN = ".yggtorrent.gg"
+    YGGTORRENT_DOMAIN = ".yggtorrent.se"
    YGGTORRENT_GET_FILES = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
    YGGTORRENT_GET_INFO = f"https://www2.yggtorrentchg/engine/get_nfo?torrent="
@ -108,7 +109,7 @@ class YggTorrentScraper:
            "User-Agent": "PostmanRuntime/7.17.1",
            "Accept": "*/*",
            "Cache-Control": "no-cache",
-            "Host": f"www.yggtorrent.{YGGTORRENT_TLD}",
+            "Host": f"www2.yggtorrent.{YGGTORRENT_TLD}",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
        }
@ -145,7 +146,7 @@ class YggTorrentScraper:
        """
        Logout request
        """
-        response = self.session.get(YGGTORRENT_LOGOUT_URL)
+        response = self.session.get(YGGTORRENT_LOGOUT_URL, headers=headers)
        self.session.cookies.clear()
@ -160,12 +161,18 @@ class YggTorrentScraper:
            return False
-    def search(self, parameters):
+    #kopa
    def search_old(self, parameters):
        """
        Previous search implementation: build the search URL from the given
        parameters and return what get_torrents_url produces for it.
        """
        return self.get_torrents_url(create_search_url(parameters), parameters)
    def search(self, parameters):
        """
        Run the Selenium/geckodriver search script in-process and return the result.

        NOTE(review): ``parameters`` is never read here, and the built-in
        ``exec()`` always returns None, so this method currently returns None
        whatever the script does. The script is read from a hard-coded absolute
        path; presumably it is the gecko/torrent_search.py helper, which takes
        its search term from sys.argv -- TODO confirm and pass the term explicitly.
        """
 #        torrents_url = os.popen('gecko/torrent_search.py didier')
        # Executes the script's source text inside this process; the file handle
        # returned by open() is not closed explicitly.
        torrents_url = exec(open('/home/iptubes/astroport-iptubes/yggcrawl/gecko/torrent_search.py').read())
        return torrents_url
    def extract_details(self, torrent_url):
        """
        Extract informations from torrent's url
@ -174,7 +181,7 @@ class YggTorrentScraper:
        torrents = []
-        response = self.session.get(torrent_url)
+        response = self.session.get(torrent_url, headers=headers)
        torrent_page = BeautifulSoup(response.content, features="lxml")
@ -237,7 +244,7 @@ class YggTorrentScraper:
            "input", {"type": "hidden", "name": "target"}
        )["value"]
-        response = self.session.get(YGGTORRENT_GET_FILES + torrent_id)
+        response = self.session.get(YGGTORRENT_GET_FILES + torrent_id, headers=headers)
        files_page = BeautifulSoup(response.content, features="lxml")
@ -292,12 +299,12 @@ class YggTorrentScraper:
        return torrents_url
 #kopaa
    def get_torrents_url(self, search_url, parameters):
        """
        Return
        """
-
+        response = self.session.get(search_url, headers=headers)
        response = self.session.get(search_url)
        search_page = BeautifulSoup(response.content, features="lxml")
@ -317,7 +324,7 @@ class YggTorrentScraper:
            search_url = create_search_url(parameters)
-            response = self.session.get(search_url)
+            response = self.session.get(search_url, headers=headers)
            search_page = BeautifulSoup(response.content, features="lxml")
@ -328,7 +335,6 @@ class YggTorrentScraper:
        return torrents
 #kopa
    def download_from_torrent_url(self, torrent_url=None, destination_path="./data/tmp/torrents/"):
        if torrent_url is not None:
            torrent = self.extract_details(torrent_url)
@ -349,7 +355,7 @@ class YggTorrentScraper:
        if torrent_url is None:
            raise Exception("Invalid torrent_url, make sure you are logged")
-        response = self.session.get(YGGTORRENT_BASE_URL + torrent_url)
+        response = self.session.get(YGGTORRENT_BASE_URL + torrent_url, headers=headers)
        temp_file_name = response.headers.get("content-disposition")
@ -368,7 +374,6 @@ class YggTorrentScraper:
        return file_full_path
 def create_search_url(parameters):
    """
    Return a formated URL for torrent's search