[Early stage] Add Selenium WebDriver mechanism to bypass Cloudflare

This commit is contained in:
poka 2020-05-08 06:48:26 +02:00
parent 79e3106c08
commit 2968a2d312
4 changed files with 66 additions and 17 deletions

1
.gitignore vendored
View File

@ -3,3 +3,4 @@ __pycache__/
yggcrawl/__pycache__/
yggcrawl/__init__.pyc
login.py
yggcrawl/gecko/geckodriver.log

View File

@ -104,11 +104,10 @@ def downloadTorrent():
# Download torrent file
if(scraper.login(login.user, login.passwd)):
print(colored("Login success", 'green'))
subprocess.Popen('[[ $(ls data/tmp/torrents/) ]] && rm data/tmp/torrents/*', executable='/bin/bash')
subprocess.Popen('[[ $(ls data/tmp/torrents/) ]] && rm data/tmp/torrents/*', executable='/bin/bash', stdout=subprocess.PIPE, stderr=subprocess.PIPE)
scraper.download_from_torrent_url(research)
# os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent}.torrent && mv *.torrent ../../torrents/')
os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent.strip()}.torrent && mv {idTorrent.strip()}.torrent ../../torrents/')
# os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent.strip()}.torrent && mv {idTorrent.strip()}.torrent ../../torrents/').read()
os.popen('cd data/tmp/torrents/ && mv *.torrent ../../torrents/')
else:
print(colored("Login failed", 'red'))
sys.exit(1)

View File

@ -0,0 +1,44 @@
#!/usr/bin/python3
# Early example of how to use Selenium with geckodriver to get past
# Cloudflare's bot detection.
# The only way to block this would be a captcha in front of every
# yggtorrent page, per session...
#
# Usage: torrent_search.py <film or series name>
# Prints the matching torrent URLs, one per line.
import sys
import time
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Exit if no arguments
if len(sys.argv) == 1:
    sys.exit("Please choose a film ou serie name")
arg1 = sys.argv[1]

# Percent-encode the search term so that spaces, '&', '#', '=' and similar
# characters cannot break or alter the query string.
search_url = (
    "https://www2.yggtorrent.se/engine/search"
    f"?name={quote_plus(arg1)}"
    "&description=&file=&uploader=&category=all&sub_category="
    "&do=search&order=desc&sort=seed"
)

# Load webdriver with Gecko
options = webdriver.FirefoxOptions()
options.add_argument('-headless')
driver = webdriver.Firefox(options=options, executable_path=r'/usr/local/bin/geckodriver')

# Everything after the browser is started runs under try/finally so the
# headless Firefox process is always shut down, even when the wait below
# times out or the page cannot be scraped.
try:
    driver.get(search_url)

    # Wait to bypass cloudflare: the wait ends once the browser's current URL
    # differs from the one requested (raises TimeoutException after 10 s).
    print("Page atteinte, attente de redirection anti-crawling...")
    wait = WebDriverWait(driver, 10)
    wait.until(EC.url_changes(search_url))

    # Wait 2 seconds to load page
    print("Anti-crawling passé, affichage dans 2 secondes ...")
    time.sleep(2)

    # Filter torrent urls: keep only hrefs found under ".results" that
    # contain "/torrent/". A missing/empty href is skipped instead of raising.
    elems = driver.find_elements(By.CSS_SELECTOR, ".results [href]")
    links = [elem.get_attribute('href') for elem in elems]
    links = [k for k in links if k and '/torrent/' in k]

    # Print torrents urls
    print("\n".join(links))
finally:
    driver.quit()

View File

@ -44,7 +44,7 @@ YGGTORRENT_SEARCH_URL_DO = "&do="
YGGTORRENT_SEARCH_URL_PAGE = "&page="
YGGTORRENT_GET_FILES = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
YGGTORRENT_GET_INFO = f"https://www2.yggtorrentchg/engine/get_nfo?torrent="
YGGTORRENT_GET_INFO = f"https://www2.yggtorrent.se/engine/get_nfo?torrent="
YGGTORRENT_MOST_COMPLETED_URL = f"{YGGTORRENT_BASE_URL}/engine/mostcompleted"
@ -52,6 +52,7 @@ TORRENT_PER_PAGE = 50
YGGTORRENT_FILES_URL = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
def set_yggtorrent_tld(yggtorrent_tld=None):
"""
@ -77,7 +78,7 @@ def set_yggtorrent_tld(yggtorrent_tld=None):
YGGTORRENT_SEARCH_URL = f"{YGGTORRENT_BASE_URL}/engine/search?name="
YGGTORRENT_DOMAIN = ".yggtorrent.gg"
YGGTORRENT_DOMAIN = ".yggtorrent.se"
YGGTORRENT_GET_FILES = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
YGGTORRENT_GET_INFO = f"https://www2.yggtorrentchg/engine/get_nfo?torrent="
@ -108,7 +109,7 @@ class YggTorrentScraper:
"User-Agent": "PostmanRuntime/7.17.1",
"Accept": "*/*",
"Cache-Control": "no-cache",
"Host": f"www.yggtorrent.{YGGTORRENT_TLD}",
"Host": f"www2.yggtorrent.{YGGTORRENT_TLD}",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
}
@ -145,7 +146,7 @@ class YggTorrentScraper:
"""
Logout request
"""
response = self.session.get(YGGTORRENT_LOGOUT_URL)
response = self.session.get(YGGTORRENT_LOGOUT_URL, headers=headers)
self.session.cookies.clear()
@ -160,12 +161,18 @@ class YggTorrentScraper:
return False
def search(self, parameters):
#kopa
def search_old(self, parameters):
    """
    Previous search implementation.

    Builds the search URL from ``parameters`` with ``create_search_url`` and
    returns whatever ``get_torrents_url`` produces for that URL.
    """
    url = create_search_url(parameters)
    return self.get_torrents_url(url, parameters)
def search(self, parameters):
    """
    Execute the source of the ``torrent_search.py`` script in this process.

    NOTE(review): ``parameters`` is not used by this method. ``exec()``
    always returns ``None``, so ``torrents_url`` is ``None`` and this method
    returns ``None`` whatever the script prints. The script path is absolute
    and hard-coded, and the file object opened here is never explicitly
    closed.
    """
    # torrents_url = os.popen('gecko/torrent_search.py didier')
    # The script text is read in full and run with exec(); exec() itself yields None.
    torrents_url = exec(open('/home/iptubes/astroport-iptubes/yggcrawl/gecko/torrent_search.py').read())
    return torrents_url
def extract_details(self, torrent_url):
"""
Extract informations from torrent's url
@ -174,7 +181,7 @@ class YggTorrentScraper:
torrents = []
response = self.session.get(torrent_url)
response = self.session.get(torrent_url, headers=headers)
torrent_page = BeautifulSoup(response.content, features="lxml")
@ -237,7 +244,7 @@ class YggTorrentScraper:
"input", {"type": "hidden", "name": "target"}
)["value"]
response = self.session.get(YGGTORRENT_GET_FILES + torrent_id)
response = self.session.get(YGGTORRENT_GET_FILES + torrent_id, headers=headers)
files_page = BeautifulSoup(response.content, features="lxml")
@ -292,12 +299,12 @@ class YggTorrentScraper:
return torrents_url
#kopaa
def get_torrents_url(self, search_url, parameters):
"""
Return
"""
response = self.session.get(search_url)
response = self.session.get(search_url, headers=headers)
search_page = BeautifulSoup(response.content, features="lxml")
@ -317,7 +324,7 @@ class YggTorrentScraper:
search_url = create_search_url(parameters)
response = self.session.get(search_url)
response = self.session.get(search_url, headers=headers)
search_page = BeautifulSoup(response.content, features="lxml")
@ -328,7 +335,6 @@ class YggTorrentScraper:
return torrents
#kopa
def download_from_torrent_url(self, torrent_url=None, destination_path="./data/tmp/torrents/"):
if torrent_url is not None:
torrent = self.extract_details(torrent_url)
@ -349,7 +355,7 @@ class YggTorrentScraper:
if torrent_url is None:
raise Exception("Invalid torrent_url, make sure you are logged")
response = self.session.get(YGGTORRENT_BASE_URL + torrent_url)
response = self.session.get(YGGTORRENT_BASE_URL + torrent_url, headers=headers)
temp_file_name = response.headers.get("content-disposition")
@ -368,7 +374,6 @@ class YggTorrentScraper:
return file_full_path
def create_search_url(parameters):
"""
Return a formated URL for torrent's search