From 2968a2d312355493e8659840165ec3d6c6f0a3c9 Mon Sep 17 00:00:00 2001 From: poka Date: Fri, 8 May 2020 06:48:26 +0200 Subject: [PATCH] [Early stage] Add selenium webdriver mechanic to bypass cloudflare --- .gitignore | 1 + crawl.py | 7 +++-- yggcrawl/gecko/torrent_search.py | 44 ++++++++++++++++++++++++++++++++ yggcrawl/yggtorrentscraper.py | 31 ++++++++++++---------- 4 files changed, 66 insertions(+), 17 deletions(-) create mode 100755 yggcrawl/gecko/torrent_search.py diff --git a/.gitignore b/.gitignore index 951fe8d..9d9de72 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,4 @@ __pycache__/ yggcrawl/__pycache__/ yggcrawl/__init__.pyc login.py +yggcrawl/gecko/geckodriver.log diff --git a/crawl.py b/crawl.py index fefd517..4bb6238 100755 --- a/crawl.py +++ b/crawl.py @@ -104,11 +104,10 @@ def downloadTorrent(): # Download torrent file if(scraper.login(login.user, login.passwd)): print(colored("Login success", 'green')) - subprocess.Popen('[[ $(ls data/tmp/torrents/) ]] && rm data/tmp/torrents/*', executable='/bin/bash') + subprocess.Popen('[[ $(ls data/tmp/torrents/) ]] && rm data/tmp/torrents/*', executable='/bin/bash', stdout=subprocess.PIPE, stderr=subprocess.PIPE) scraper.download_from_torrent_url(research) -# os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent}.torrent && mv *.torrent ../../torrents/') - os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent.strip()}.torrent && mv {idTorrent.strip()}.torrent ../../torrents/') - +# os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent.strip()}.torrent && mv {idTorrent.strip()}.torrent ../../torrents/').read() + os.popen('cd data/tmp/torrents/ && mv *.torrent ../../torrents/') else: print(colored("Login failed", 'red')) sys.exit(1) diff --git a/yggcrawl/gecko/torrent_search.py b/yggcrawl/gecko/torrent_search.py new file mode 100755 index 0000000..1746e73 --- /dev/null +++ b/yggcrawl/gecko/torrent_search.py @@ -0,0 +1,44 @@ +#!/usr/bin/python3 + +# Early exemple of how to use selenium with gecko to bypass cloudflare bots detections +# The only way to block this should be using of captcha in front of every yggtorrent pages by sessions... + +import sys +import time + +from selenium import webdriver +from selenium.webdriver.common.by import By +from selenium.webdriver.support.ui import WebDriverWait +from selenium.webdriver.support import expected_conditions as EC + +# Exit if no arguments +if len(sys.argv)==1: sys.exit("Please choose a film ou serie name") +else: arg1 = sys.argv[1] + +search_url = f"https://www2.yggtorrent.se/engine/search?name={arg1}&description=&file=&uploader=&category=all&sub_category=&do=search&order=desc&sort=seed" + +# Load webdriver with Gecko +options = webdriver.FirefoxOptions() +options.add_argument('-headless') +driver = webdriver.Firefox(options=options, executable_path=r'/usr/local/bin/geckodriver') +driver.get(search_url) + +# Wait to bypass cloudflare +print("Page atteinte, attente de redirection anti-crawling...") +wait = WebDriverWait(driver, 10) +wait.until(lambda driver: driver.current_url != search_url) + +# Wait 2 seconds to load page +print("Anti-crawling passé, affichage dans 2 secondes ...") +time.sleep(2) + +# Filter torrent urls +elems = driver.find_elements_by_css_selector(".results [href]") +links = [elem.get_attribute('href') for elem in elems] +links = [k for k in links if '/torrent/' in k] + +# Print torrents urls +print("\n".join(links)) + + +driver.quit() diff --git a/yggcrawl/yggtorrentscraper.py b/yggcrawl/yggtorrentscraper.py index d9c0802..66a2dd3 100644 --- a/yggcrawl/yggtorrentscraper.py +++ b/yggcrawl/yggtorrentscraper.py @@ -44,7 +44,7 @@ YGGTORRENT_SEARCH_URL_DO = "&do=" YGGTORRENT_SEARCH_URL_PAGE = "&page=" YGGTORRENT_GET_FILES = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent=" -YGGTORRENT_GET_INFO = f"https://www2.yggtorrentchg/engine/get_nfo?torrent=" +YGGTORRENT_GET_INFO = f"https://www2.yggtorrent.se/engine/get_nfo?torrent=" YGGTORRENT_MOST_COMPLETED_URL = f"{YGGTORRENT_BASE_URL}/engine/mostcompleted" @@ -52,6 +52,7 @@ TORRENT_PER_PAGE = 50 YGGTORRENT_FILES_URL = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent=" +headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'} def set_yggtorrent_tld(yggtorrent_tld=None): """ @@ -77,7 +78,7 @@ def set_yggtorrent_tld(yggtorrent_tld=None): YGGTORRENT_SEARCH_URL = f"{YGGTORRENT_BASE_URL}/engine/search?name=" - YGGTORRENT_DOMAIN = ".yggtorrent.gg" + YGGTORRENT_DOMAIN = ".yggtorrent.se" YGGTORRENT_GET_FILES = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent=" YGGTORRENT_GET_INFO = f"https://www2.yggtorrentchg/engine/get_nfo?torrent=" @@ -108,7 +109,7 @@ class YggTorrentScraper: "User-Agent": "PostmanRuntime/7.17.1", "Accept": "*/*", "Cache-Control": "no-cache", - "Host": f"www.yggtorrent.{YGGTORRENT_TLD}", + "Host": f"www2.yggtorrent.{YGGTORRENT_TLD}", "Accept-Encoding": "gzip, deflate", "Connection": "keep-alive", } @@ -145,7 +146,7 @@ class YggTorrentScraper: """ Logout request """ - response = self.session.get(YGGTORRENT_LOGOUT_URL) + response = self.session.get(YGGTORRENT_LOGOUT_URL, headers=headers) self.session.cookies.clear() @@ -160,12 +161,18 @@ class YggTorrentScraper: return False - def search(self, parameters): + #kopa + def search_old(self, parameters): search_url = create_search_url(parameters) torrents_url = self.get_torrents_url(search_url, parameters) return torrents_url + def search(self, parameters): +# torrents_url = os.popen('gecko/torrent_search.py didier') + torrents_url = exec(open('/home/iptubes/astroport-iptubes/yggcrawl/gecko/torrent_search.py').read()) + return torrents_url + def extract_details(self, torrent_url): """ Extract informations from torrent's url @@ -174,7 +181,7 @@ class YggTorrentScraper: torrents = [] - response = self.session.get(torrent_url) + response = self.session.get(torrent_url, headers=headers) torrent_page = BeautifulSoup(response.content, features="lxml") @@ -237,7 +244,7 @@ class YggTorrentScraper: "input", {"type": "hidden", "name": "target"} )["value"] - response = self.session.get(YGGTORRENT_GET_FILES + torrent_id) + response = self.session.get(YGGTORRENT_GET_FILES + torrent_id, headers=headers) files_page = BeautifulSoup(response.content, features="lxml") @@ -292,12 +299,12 @@ class YggTorrentScraper: return torrents_url +#kopaa def get_torrents_url(self, search_url, parameters): """ Return """ - - response = self.session.get(search_url) + response = self.session.get(search_url, headers=headers) search_page = BeautifulSoup(response.content, features="lxml") @@ -317,7 +324,7 @@ class YggTorrentScraper: search_url = create_search_url(parameters) - response = self.session.get(search_url) + response = self.session.get(search_url, headers=headers) search_page = BeautifulSoup(response.content, features="lxml") @@ -328,7 +335,6 @@ class YggTorrentScraper: return torrents -#kopa def download_from_torrent_url(self, torrent_url=None, destination_path="./data/tmp/torrents/"): if torrent_url is not None: torrent = self.extract_details(torrent_url) @@ -349,7 +355,7 @@ class YggTorrentScraper: if torrent_url is None: raise Exception("Invalid torrent_url, make sure you are logged") - response = self.session.get(YGGTORRENT_BASE_URL + torrent_url) + response = self.session.get(YGGTORRENT_BASE_URL + torrent_url, headers=headers) temp_file_name = response.headers.get("content-disposition") @@ -368,7 +374,6 @@ class YggTorrentScraper: return file_full_path - def create_search_url(parameters): """ Return a formated URL for torrent's search