[Early stage] Add selenium webdriver mechanism to bypass cloudflare

2020-05-08 06:48:26 +02:00 · 2020-05-08 06:48:26 +02:00 · 2968a2d312
parent 79e3106c08
commit 2968a2d312
4 changed files with 66 additions and 17 deletions
--- a/.gitignore
+++ b/.gitignore
@ -3,3 +3,4 @@ __pycache__/
 yggcrawl/__pycache__/
 yggcrawl/__init__.pyc
 login.py
+yggcrawl/gecko/geckodriver.log
--- a/crawl.py
+++ b/crawl.py
@ -104,11 +104,10 @@ def downloadTorrent():
        # Download torrent file
        if(scraper.login(login.user, login.passwd)):
            print(colored("Login success", 'green'))
-            subprocess.Popen('[[ $(ls data/tmp/torrents/) ]] && rm data/tmp/torrents/*', executable='/bin/bash')
+            subprocess.Popen('[[ $(ls data/tmp/torrents/) ]] && rm data/tmp/torrents/*', executable='/bin/bash', stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            scraper.download_from_torrent_url(research)
-#            os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent}.torrent && mv *.torrent ../../torrents/')
-            os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent.strip()}.torrent && mv {idTorrent.strip()}.torrent ../../torrents/')
-
+#            os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent.strip()}.torrent && mv {idTorrent.strip()}.torrent ../../torrents/').read()
+            os.popen('cd data/tmp/torrents/ && mv *.torrent ../../torrents/')
        else:
            print(colored("Login failed", 'red'))
            sys.exit(1)
--- a/yggcrawl/gecko/torrent_search.py
+++ b/yggcrawl/gecko/torrent_search.py
@ -0,0 +1,44 @@
+#!/usr/bin/python3
+
+# Early example of how to use selenium with gecko to bypass cloudflare's bot detection
+# The only way to block this would be to put a captcha in front of every yggtorrent page, per session...
+
+import sys
+import time
+
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+
+# Exit if no arguments
+if len(sys.argv)==1: sys.exit("Please choose a film ou serie name")
+else: arg1 = sys.argv[1]
+
+search_url = f"https://www2.yggtorrent.se/engine/search?name={arg1}&description=&file=&uploader=&category=all&sub_category=&do=search&order=desc&sort=seed"
+
+# Load webdriver with Gecko
+options = webdriver.FirefoxOptions()
+options.add_argument('-headless')
+driver = webdriver.Firefox(options=options, executable_path=r'/usr/local/bin/geckodriver')
+driver.get(search_url)
+
+# Wait to bypass cloudflare
+print("Page atteinte, attente de redirection anti-crawling...")
+wait = WebDriverWait(driver, 10)
+wait.until(lambda driver: driver.current_url != search_url)
+
+# Wait 2 seconds to load page
+print("Anti-crawling passé, affichage dans 2 secondes ...")
+time.sleep(2)
+
+# Filter torrent urls
+elems = driver.find_elements_by_css_selector(".results [href]")
+links = [elem.get_attribute('href') for elem in elems]
+links = [k for k in links if '/torrent/' in k]
+
+# Print torrent urls
+print("\n".join(links))
+
+
+driver.quit()
--- a/yggcrawl/yggtorrentscraper.py
+++ b/yggcrawl/yggtorrentscraper.py
@ -44,7 +44,7 @@ YGGTORRENT_SEARCH_URL_DO = "&do="
 YGGTORRENT_SEARCH_URL_PAGE = "&page="

 YGGTORRENT_GET_FILES = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
-YGGTORRENT_GET_INFO = f"https://www2.yggtorrentchg/engine/get_nfo?torrent="
+YGGTORRENT_GET_INFO = f"https://www2.yggtorrent.se/engine/get_nfo?torrent="

 YGGTORRENT_MOST_COMPLETED_URL = f"{YGGTORRENT_BASE_URL}/engine/mostcompleted"

@ -52,6 +52,7 @@ TORRENT_PER_PAGE = 50

 YGGTORRENT_FILES_URL = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="

+headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}

 def set_yggtorrent_tld(yggtorrent_tld=None):
    """
@ -77,7 +78,7 @@ def set_yggtorrent_tld(yggtorrent_tld=None):

    YGGTORRENT_SEARCH_URL = f"{YGGTORRENT_BASE_URL}/engine/search?name="

-    YGGTORRENT_DOMAIN = ".yggtorrent.gg"
+    YGGTORRENT_DOMAIN = ".yggtorrent.se"

    YGGTORRENT_GET_FILES = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
    YGGTORRENT_GET_INFO = f"https://www2.yggtorrentchg/engine/get_nfo?torrent="
@ -108,7 +109,7 @@ class YggTorrentScraper:
            "User-Agent": "PostmanRuntime/7.17.1",
            "Accept": "*/*",
            "Cache-Control": "no-cache",
-            "Host": f"www.yggtorrent.{YGGTORRENT_TLD}",
+            "Host": f"www2.yggtorrent.{YGGTORRENT_TLD}",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
        }
@ -145,7 +146,7 @@ class YggTorrentScraper:
        """
        Logout request
        """
-        response = self.session.get(YGGTORRENT_LOGOUT_URL)
+        response = self.session.get(YGGTORRENT_LOGOUT_URL, headers=headers)

        self.session.cookies.clear()

@ -160,12 +161,18 @@ class YggTorrentScraper:

            return False

-    def search(self, parameters):
+    #kopa
+    def search_old(self, parameters):
        search_url = create_search_url(parameters)
        torrents_url = self.get_torrents_url(search_url, parameters)

        return torrents_url

+    def search(self, parameters):
+#        torrents_url = os.popen('gecko/torrent_search.py didier')
+        torrents_url = exec(open('/home/iptubes/astroport-iptubes/yggcrawl/gecko/torrent_search.py').read())
+        return torrents_url
+
    def extract_details(self, torrent_url):
        """
        Extract informations from torrent's url
@ -174,7 +181,7 @@ class YggTorrentScraper:

        torrents = []

-        response = self.session.get(torrent_url)
+        response = self.session.get(torrent_url, headers=headers)

        torrent_page = BeautifulSoup(response.content, features="lxml")

@ -237,7 +244,7 @@ class YggTorrentScraper:
            "input", {"type": "hidden", "name": "target"}
        )["value"]

-        response = self.session.get(YGGTORRENT_GET_FILES + torrent_id)
+        response = self.session.get(YGGTORRENT_GET_FILES + torrent_id, headers=headers)

        files_page = BeautifulSoup(response.content, features="lxml")

@ -292,12 +299,12 @@ class YggTorrentScraper:

        return torrents_url

+#kopaa
    def get_torrents_url(self, search_url, parameters):
        """
        Return
        """
-
-        response = self.session.get(search_url)
+        response = self.session.get(search_url, headers=headers)

        search_page = BeautifulSoup(response.content, features="lxml")

@ -317,7 +324,7 @@ class YggTorrentScraper:

            search_url = create_search_url(parameters)

-            response = self.session.get(search_url)
+            response = self.session.get(search_url, headers=headers)

            search_page = BeautifulSoup(response.content, features="lxml")

@ -328,7 +335,6 @@ class YggTorrentScraper:

        return torrents

-#kopa
    def download_from_torrent_url(self, torrent_url=None, destination_path="./data/tmp/torrents/"):
        if torrent_url is not None:
            torrent = self.extract_details(torrent_url)
@ -349,7 +355,7 @@ class YggTorrentScraper:
        if torrent_url is None:
            raise Exception("Invalid torrent_url, make sure you are logged")

-        response = self.session.get(YGGTORRENT_BASE_URL + torrent_url)
+        response = self.session.get(YGGTORRENT_BASE_URL + torrent_url, headers=headers)

        temp_file_name = response.headers.get("content-disposition")

@ -368,7 +374,6 @@ class YggTorrentScraper:

        return file_full_path

-
 def create_search_url(parameters):
    """
    Return a formated URL for torrent's search