From 2968a2d312355493e8659840165ec3d6c6f0a3c9 Mon Sep 17 00:00:00 2001
From: poka <poka@p2p.legal>
Date: Fri, 8 May 2020 06:48:26 +0200
Subject: [PATCH] [Early stage] Add Selenium WebDriver mechanism to bypass
 Cloudflare

---
 .gitignore                       |  1 +
 crawl.py                         |  7 +++--
 yggcrawl/gecko/torrent_search.py | 44 ++++++++++++++++++++++++++++++++
 yggcrawl/yggtorrentscraper.py    | 31 ++++++++++++----------
 4 files changed, 66 insertions(+), 17 deletions(-)
 create mode 100755 yggcrawl/gecko/torrent_search.py

diff --git a/.gitignore b/.gitignore
index 951fe8d..9d9de72 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,3 +3,4 @@ __pycache__/
 yggcrawl/__pycache__/
 yggcrawl/__init__.pyc
 login.py
+yggcrawl/gecko/geckodriver.log
diff --git a/crawl.py b/crawl.py
index fefd517..4bb6238 100755
--- a/crawl.py
+++ b/crawl.py
@@ -104,11 +104,10 @@ def downloadTorrent():
         # Download torrent file
         if(scraper.login(login.user, login.passwd)):
             print(colored("Login success", 'green'))
-            subprocess.Popen('[[ $(ls data/tmp/torrents/) ]] && rm data/tmp/torrents/*', executable='/bin/bash')
+            subprocess.Popen('[[ $(ls data/tmp/torrents/) ]] && rm data/tmp/torrents/*', executable='/bin/bash', stdout=subprocess.PIPE, stderr=subprocess.PIPE)
             scraper.download_from_torrent_url(research)
-#            os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent}.torrent && mv *.torrent ../../torrents/')
-            os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent.strip()}.torrent && mv {idTorrent.strip()}.torrent ../../torrents/')
-
+#            os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent.strip()}.torrent && mv {idTorrent.strip()}.torrent ../../torrents/').read()
+            os.popen('cd data/tmp/torrents/ && mv *.torrent ../../torrents/')
         else:
             print(colored("Login failed", 'red'))
             sys.exit(1)
diff --git a/yggcrawl/gecko/torrent_search.py b/yggcrawl/gecko/torrent_search.py
new file mode 100755
index 0000000..1746e73
--- /dev/null
+++ b/yggcrawl/gecko/torrent_search.py
@@ -0,0 +1,44 @@
+#!/usr/bin/python3
+
+# Early example of how to use Selenium with geckodriver to bypass Cloudflare's bot detection.
+# The only way to block this would be to put a captcha in front of every yggtorrent page, per session...
+
+import sys
+import time
+
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+
+# Exit if no arguments
+if len(sys.argv)==1: sys.exit("Please choose a film ou serie name")
+else: arg1 = sys.argv[1]
+
+search_url = f"https://www2.yggtorrent.se/engine/search?name={arg1}&description=&file=&uploader=&category=all&sub_category=&do=search&order=desc&sort=seed"
+
+# Load webdriver with Gecko
+options = webdriver.FirefoxOptions()
+options.add_argument('-headless')
+driver = webdriver.Firefox(options=options, executable_path=r'/usr/local/bin/geckodriver')
+driver.get(search_url)
+
+# Wait to bypass cloudflare
+print("Page atteinte, attente de redirection anti-crawling...")
+wait = WebDriverWait(driver, 10)
+wait.until(lambda driver: driver.current_url != search_url)
+
+# Wait 2 seconds to load page
+print("Anti-crawling passé, affichage dans 2 secondes ...")
+time.sleep(2)
+
+# Filter torrent urls
+elems = driver.find_elements_by_css_selector(".results [href]")
+links = [elem.get_attribute('href') for elem in elems]
+links = [k for k in links if '/torrent/' in k]
+
+# Print torrents urls
+print("\n".join(links))
+
+
+driver.quit()
diff --git a/yggcrawl/yggtorrentscraper.py b/yggcrawl/yggtorrentscraper.py
index d9c0802..66a2dd3 100644
--- a/yggcrawl/yggtorrentscraper.py
+++ b/yggcrawl/yggtorrentscraper.py
@@ -44,7 +44,7 @@ YGGTORRENT_SEARCH_URL_DO = "&do="
 YGGTORRENT_SEARCH_URL_PAGE = "&page="
 
 YGGTORRENT_GET_FILES = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
-YGGTORRENT_GET_INFO = f"https://www2.yggtorrentchg/engine/get_nfo?torrent="
+YGGTORRENT_GET_INFO = f"https://www2.yggtorrent.se/engine/get_nfo?torrent="
 
 YGGTORRENT_MOST_COMPLETED_URL = f"{YGGTORRENT_BASE_URL}/engine/mostcompleted"
 
@@ -52,6 +52,7 @@ TORRENT_PER_PAGE = 50
 
 YGGTORRENT_FILES_URL = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
 
+headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
 
 def set_yggtorrent_tld(yggtorrent_tld=None):
     """
@@ -77,7 +78,7 @@ def set_yggtorrent_tld(yggtorrent_tld=None):
 
     YGGTORRENT_SEARCH_URL = f"{YGGTORRENT_BASE_URL}/engine/search?name="
 
-    YGGTORRENT_DOMAIN = ".yggtorrent.gg"
+    YGGTORRENT_DOMAIN = ".yggtorrent.se"
 
     YGGTORRENT_GET_FILES = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
     YGGTORRENT_GET_INFO = f"https://www2.yggtorrentchg/engine/get_nfo?torrent="
@@ -108,7 +109,7 @@ class YggTorrentScraper:
             "User-Agent": "PostmanRuntime/7.17.1",
             "Accept": "*/*",
             "Cache-Control": "no-cache",
-            "Host": f"www.yggtorrent.{YGGTORRENT_TLD}",
+            "Host": f"www2.yggtorrent.{YGGTORRENT_TLD}",
             "Accept-Encoding": "gzip, deflate",
             "Connection": "keep-alive",
         }
@@ -145,7 +146,7 @@ class YggTorrentScraper:
         """
         Logout request
         """
-        response = self.session.get(YGGTORRENT_LOGOUT_URL)
+        response = self.session.get(YGGTORRENT_LOGOUT_URL, headers=headers)
 
         self.session.cookies.clear()
 
@@ -160,12 +161,18 @@ class YggTorrentScraper:
 
             return False
 
-    def search(self, parameters):
+    #kopa
+    def search_old(self, parameters):
         search_url = create_search_url(parameters)
         torrents_url = self.get_torrents_url(search_url, parameters)
 
         return torrents_url
 
+    def search(self, parameters):
+#        torrents_url = os.popen('gecko/torrent_search.py didier')
+        torrents_url = exec(open('/home/iptubes/astroport-iptubes/yggcrawl/gecko/torrent_search.py').read())
+        return torrents_url
+
     def extract_details(self, torrent_url):
         """
         Extract informations from torrent's url
@@ -174,7 +181,7 @@ class YggTorrentScraper:
 
         torrents = []
 
-        response = self.session.get(torrent_url)
+        response = self.session.get(torrent_url, headers=headers)
 
         torrent_page = BeautifulSoup(response.content, features="lxml")
 
@@ -237,7 +244,7 @@ class YggTorrentScraper:
             "input", {"type": "hidden", "name": "target"}
         )["value"]
 
-        response = self.session.get(YGGTORRENT_GET_FILES + torrent_id)
+        response = self.session.get(YGGTORRENT_GET_FILES + torrent_id, headers=headers)
 
         files_page = BeautifulSoup(response.content, features="lxml")
 
@@ -292,12 +299,12 @@ class YggTorrentScraper:
 
         return torrents_url
 
+#kopaa
     def get_torrents_url(self, search_url, parameters):
         """
         Return
         """
-
-        response = self.session.get(search_url)
+        response = self.session.get(search_url, headers=headers)
 
         search_page = BeautifulSoup(response.content, features="lxml")
 
@@ -317,7 +324,7 @@ class YggTorrentScraper:
 
             search_url = create_search_url(parameters)
 
-            response = self.session.get(search_url)
+            response = self.session.get(search_url, headers=headers)
 
             search_page = BeautifulSoup(response.content, features="lxml")
 
@@ -328,7 +335,6 @@ class YggTorrentScraper:
 
         return torrents
 
-#kopa
     def download_from_torrent_url(self, torrent_url=None, destination_path="./data/tmp/torrents/"):
         if torrent_url is not None:
             torrent = self.extract_details(torrent_url)
@@ -349,7 +355,7 @@ class YggTorrentScraper:
         if torrent_url is None:
             raise Exception("Invalid torrent_url, make sure you are logged")
 
-        response = self.session.get(YGGTORRENT_BASE_URL + torrent_url)
+        response = self.session.get(YGGTORRENT_BASE_URL + torrent_url, headers=headers)
 
         temp_file_name = response.headers.get("content-disposition")
 
@@ -368,7 +374,6 @@ class YggTorrentScraper:
 
         return file_full_path
 
-
 def create_search_url(parameters):
     """
     Return a formated URL for torrent's search