[Early stage] Add selenium webdriver mechanic to bypass cloudflare
This commit is contained in:
parent
79e3106c08
commit
2968a2d312
|
@ -3,3 +3,4 @@ __pycache__/
|
||||||
yggcrawl/__pycache__/
|
yggcrawl/__pycache__/
|
||||||
yggcrawl/__init__.pyc
|
yggcrawl/__init__.pyc
|
||||||
login.py
|
login.py
|
||||||
|
yggcrawl/gecko/geckodriver.log
|
||||||
|
|
7
crawl.py
7
crawl.py
|
@ -104,11 +104,10 @@ def downloadTorrent():
|
||||||
# Download torrent file
|
# Download torrent file
|
||||||
if(scraper.login(login.user, login.passwd)):
|
if(scraper.login(login.user, login.passwd)):
|
||||||
print(colored("Login success", 'green'))
|
print(colored("Login success", 'green'))
|
||||||
subprocess.Popen('[[ $(ls data/tmp/torrents/) ]] && rm data/tmp/torrents/*', executable='/bin/bash')
|
subprocess.Popen('[[ $(ls data/tmp/torrents/) ]] && rm data/tmp/torrents/*', executable='/bin/bash', stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||||
scraper.download_from_torrent_url(research)
|
scraper.download_from_torrent_url(research)
|
||||||
# os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent}.torrent && mv *.torrent ../../torrents/')
|
# os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent.strip()}.torrent && mv {idTorrent.strip()}.torrent ../../torrents/').read()
|
||||||
os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent.strip()}.torrent && mv {idTorrent.strip()}.torrent ../../torrents/')
|
os.popen('cd data/tmp/torrents/ && mv *.torrent ../../torrents/')
|
||||||
|
|
||||||
else:
|
else:
|
||||||
print(colored("Login failed", 'red'))
|
print(colored("Login failed", 'red'))
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
|
@ -0,0 +1,44 @@
|
||||||
|
#!/usr/bin/python3
|
||||||
|
|
||||||
|
# Early example of how to use Selenium with geckodriver to bypass Cloudflare's bot detection
|
||||||
|
# The only way to block this would be to put a captcha in front of every yggtorrent page, per session...
|
||||||
|
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
|
from selenium import webdriver
|
||||||
|
from selenium.webdriver.common.by import By
|
||||||
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
|
from selenium.webdriver.support import expected_conditions as EC
|
||||||
|
|
||||||
|
# Exit if no arguments
|
||||||
|
if len(sys.argv)==1: sys.exit("Please choose a film ou serie name")
|
||||||
|
else: arg1 = sys.argv[1]
|
||||||
|
|
||||||
|
search_url = f"https://www2.yggtorrent.se/engine/search?name={arg1}&description=&file=&uploader=&category=all&sub_category=&do=search&order=desc&sort=seed"
|
||||||
|
|
||||||
|
# Load webdriver with Gecko
|
||||||
|
options = webdriver.FirefoxOptions()
|
||||||
|
options.add_argument('-headless')
|
||||||
|
driver = webdriver.Firefox(options=options, executable_path=r'/usr/local/bin/geckodriver')
|
||||||
|
driver.get(search_url)
|
||||||
|
|
||||||
|
# Wait to bypass cloudflare
|
||||||
|
print("Page atteinte, attente de redirection anti-crawling...")
|
||||||
|
wait = WebDriverWait(driver, 10)
|
||||||
|
wait.until(lambda driver: driver.current_url != search_url)
|
||||||
|
|
||||||
|
# Wait 2 seconds to load page
|
||||||
|
print("Anti-crawling passé, affichage dans 2 secondes ...")
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
|
# Filter torrent urls
|
||||||
|
elems = driver.find_elements_by_css_selector(".results [href]")
|
||||||
|
links = [elem.get_attribute('href') for elem in elems]
|
||||||
|
links = [k for k in links if '/torrent/' in k]
|
||||||
|
|
||||||
|
# Print torrents urls
|
||||||
|
print("\n".join(links))
|
||||||
|
|
||||||
|
|
||||||
|
driver.quit()
|
|
@ -44,7 +44,7 @@ YGGTORRENT_SEARCH_URL_DO = "&do="
|
||||||
YGGTORRENT_SEARCH_URL_PAGE = "&page="
|
YGGTORRENT_SEARCH_URL_PAGE = "&page="
|
||||||
|
|
||||||
YGGTORRENT_GET_FILES = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
|
YGGTORRENT_GET_FILES = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
|
||||||
YGGTORRENT_GET_INFO = f"https://www2.yggtorrentchg/engine/get_nfo?torrent="
|
YGGTORRENT_GET_INFO = f"https://www2.yggtorrent.se/engine/get_nfo?torrent="
|
||||||
|
|
||||||
YGGTORRENT_MOST_COMPLETED_URL = f"{YGGTORRENT_BASE_URL}/engine/mostcompleted"
|
YGGTORRENT_MOST_COMPLETED_URL = f"{YGGTORRENT_BASE_URL}/engine/mostcompleted"
|
||||||
|
|
||||||
|
@ -52,6 +52,7 @@ TORRENT_PER_PAGE = 50
|
||||||
|
|
||||||
YGGTORRENT_FILES_URL = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
|
YGGTORRENT_FILES_URL = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
|
||||||
|
|
||||||
|
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
|
||||||
|
|
||||||
def set_yggtorrent_tld(yggtorrent_tld=None):
|
def set_yggtorrent_tld(yggtorrent_tld=None):
|
||||||
"""
|
"""
|
||||||
|
@ -77,7 +78,7 @@ def set_yggtorrent_tld(yggtorrent_tld=None):
|
||||||
|
|
||||||
YGGTORRENT_SEARCH_URL = f"{YGGTORRENT_BASE_URL}/engine/search?name="
|
YGGTORRENT_SEARCH_URL = f"{YGGTORRENT_BASE_URL}/engine/search?name="
|
||||||
|
|
||||||
YGGTORRENT_DOMAIN = ".yggtorrent.gg"
|
YGGTORRENT_DOMAIN = ".yggtorrent.se"
|
||||||
|
|
||||||
YGGTORRENT_GET_FILES = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
|
YGGTORRENT_GET_FILES = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
|
||||||
YGGTORRENT_GET_INFO = f"https://www2.yggtorrentchg/engine/get_nfo?torrent="
|
YGGTORRENT_GET_INFO = f"https://www2.yggtorrentchg/engine/get_nfo?torrent="
|
||||||
|
@ -108,7 +109,7 @@ class YggTorrentScraper:
|
||||||
"User-Agent": "PostmanRuntime/7.17.1",
|
"User-Agent": "PostmanRuntime/7.17.1",
|
||||||
"Accept": "*/*",
|
"Accept": "*/*",
|
||||||
"Cache-Control": "no-cache",
|
"Cache-Control": "no-cache",
|
||||||
"Host": f"www.yggtorrent.{YGGTORRENT_TLD}",
|
"Host": f"www2.yggtorrent.{YGGTORRENT_TLD}",
|
||||||
"Accept-Encoding": "gzip, deflate",
|
"Accept-Encoding": "gzip, deflate",
|
||||||
"Connection": "keep-alive",
|
"Connection": "keep-alive",
|
||||||
}
|
}
|
||||||
|
@ -145,7 +146,7 @@ class YggTorrentScraper:
|
||||||
"""
|
"""
|
||||||
Logout request
|
Logout request
|
||||||
"""
|
"""
|
||||||
response = self.session.get(YGGTORRENT_LOGOUT_URL)
|
response = self.session.get(YGGTORRENT_LOGOUT_URL, headers=headers)
|
||||||
|
|
||||||
self.session.cookies.clear()
|
self.session.cookies.clear()
|
||||||
|
|
||||||
|
@ -160,12 +161,18 @@ class YggTorrentScraper:
|
||||||
|
|
||||||
return False
|
return False
|
||||||
|
|
||||||
def search(self, parameters):
|
#kopa
|
||||||
|
def search_old(self, parameters):
|
||||||
search_url = create_search_url(parameters)
|
search_url = create_search_url(parameters)
|
||||||
torrents_url = self.get_torrents_url(search_url, parameters)
|
torrents_url = self.get_torrents_url(search_url, parameters)
|
||||||
|
|
||||||
return torrents_url
|
return torrents_url
|
||||||
|
|
||||||
|
def search(self, parameters):
|
||||||
|
# torrents_url = os.popen('gecko/torrent_search.py didier')
|
||||||
|
torrents_url = exec(open('/home/iptubes/astroport-iptubes/yggcrawl/gecko/torrent_search.py').read())
|
||||||
|
return torrents_url
|
||||||
|
|
||||||
def extract_details(self, torrent_url):
|
def extract_details(self, torrent_url):
|
||||||
"""
|
"""
|
||||||
Extract information from a torrent's URL
|
Extract information from a torrent's URL
|
||||||
|
@ -174,7 +181,7 @@ class YggTorrentScraper:
|
||||||
|
|
||||||
torrents = []
|
torrents = []
|
||||||
|
|
||||||
response = self.session.get(torrent_url)
|
response = self.session.get(torrent_url, headers=headers)
|
||||||
|
|
||||||
torrent_page = BeautifulSoup(response.content, features="lxml")
|
torrent_page = BeautifulSoup(response.content, features="lxml")
|
||||||
|
|
||||||
|
@ -237,7 +244,7 @@ class YggTorrentScraper:
|
||||||
"input", {"type": "hidden", "name": "target"}
|
"input", {"type": "hidden", "name": "target"}
|
||||||
)["value"]
|
)["value"]
|
||||||
|
|
||||||
response = self.session.get(YGGTORRENT_GET_FILES + torrent_id)
|
response = self.session.get(YGGTORRENT_GET_FILES + torrent_id, headers=headers)
|
||||||
|
|
||||||
files_page = BeautifulSoup(response.content, features="lxml")
|
files_page = BeautifulSoup(response.content, features="lxml")
|
||||||
|
|
||||||
|
@ -292,12 +299,12 @@ class YggTorrentScraper:
|
||||||
|
|
||||||
return torrents_url
|
return torrents_url
|
||||||
|
|
||||||
|
#kopaa
|
||||||
def get_torrents_url(self, search_url, parameters):
|
def get_torrents_url(self, search_url, parameters):
|
||||||
"""
|
"""
|
||||||
Return
|
Return
|
||||||
"""
|
"""
|
||||||
|
response = self.session.get(search_url, headers=headers)
|
||||||
response = self.session.get(search_url)
|
|
||||||
|
|
||||||
search_page = BeautifulSoup(response.content, features="lxml")
|
search_page = BeautifulSoup(response.content, features="lxml")
|
||||||
|
|
||||||
|
@ -317,7 +324,7 @@ class YggTorrentScraper:
|
||||||
|
|
||||||
search_url = create_search_url(parameters)
|
search_url = create_search_url(parameters)
|
||||||
|
|
||||||
response = self.session.get(search_url)
|
response = self.session.get(search_url, headers=headers)
|
||||||
|
|
||||||
search_page = BeautifulSoup(response.content, features="lxml")
|
search_page = BeautifulSoup(response.content, features="lxml")
|
||||||
|
|
||||||
|
@ -328,7 +335,6 @@ class YggTorrentScraper:
|
||||||
|
|
||||||
return torrents
|
return torrents
|
||||||
|
|
||||||
#kopa
|
|
||||||
def download_from_torrent_url(self, torrent_url=None, destination_path="./data/tmp/torrents/"):
|
def download_from_torrent_url(self, torrent_url=None, destination_path="./data/tmp/torrents/"):
|
||||||
if torrent_url is not None:
|
if torrent_url is not None:
|
||||||
torrent = self.extract_details(torrent_url)
|
torrent = self.extract_details(torrent_url)
|
||||||
|
@ -349,7 +355,7 @@ class YggTorrentScraper:
|
||||||
if torrent_url is None:
|
if torrent_url is None:
|
||||||
raise Exception("Invalid torrent_url, make sure you are logged")
|
raise Exception("Invalid torrent_url, make sure you are logged")
|
||||||
|
|
||||||
response = self.session.get(YGGTORRENT_BASE_URL + torrent_url)
|
response = self.session.get(YGGTORRENT_BASE_URL + torrent_url, headers=headers)
|
||||||
|
|
||||||
temp_file_name = response.headers.get("content-disposition")
|
temp_file_name = response.headers.get("content-disposition")
|
||||||
|
|
||||||
|
@ -368,7 +374,6 @@ class YggTorrentScraper:
|
||||||
|
|
||||||
return file_full_path
|
return file_full_path
|
||||||
|
|
||||||
|
|
||||||
def create_search_url(parameters):
|
def create_search_url(parameters):
|
||||||
"""
|
"""
|
||||||
Return a formatted URL for a torrent search
|
Return a formatted URL for a torrent search
|
||||||
|
|
Loading…
Reference in New Issue