[Early stage] Add selenium webdriver mechanism to bypass cloudflare
This commit is contained in:
parent
79e3106c08
commit
2968a2d312
|
@ -3,3 +3,4 @@ __pycache__/
|
|||
yggcrawl/__pycache__/
|
||||
yggcrawl/__init__.pyc
|
||||
login.py
|
||||
yggcrawl/gecko/geckodriver.log
|
||||
|
|
7
crawl.py
7
crawl.py
|
@ -104,11 +104,10 @@ def downloadTorrent():
|
|||
# Download torrent file
|
||||
if(scraper.login(login.user, login.passwd)):
|
||||
print(colored("Login success", 'green'))
|
||||
subprocess.Popen('[[ $(ls data/tmp/torrents/) ]] && rm data/tmp/torrents/*', executable='/bin/bash')
|
||||
subprocess.Popen('[[ $(ls data/tmp/torrents/) ]] && rm data/tmp/torrents/*', executable='/bin/bash', stdout=subprocess.PIPE, stderr=subprocess.PIPE)
|
||||
scraper.download_from_torrent_url(research)
|
||||
# os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent}.torrent && mv *.torrent ../../torrents/')
|
||||
os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent.strip()}.torrent && mv {idTorrent.strip()}.torrent ../../torrents/')
|
||||
|
||||
# os.popen(f'cd data/tmp/torrents/ && mv *.torrent {idTorrent.strip()}.torrent && mv {idTorrent.strip()}.torrent ../../torrents/').read()
|
||||
os.popen('cd data/tmp/torrents/ && mv *.torrent ../../torrents/')
|
||||
else:
|
||||
print(colored("Login failed", 'red'))
|
||||
sys.exit(1)
|
||||
|
|
|
@ -0,0 +1,44 @@
|
|||
#!/usr/bin/python3

# Early example of how to use Selenium with geckodriver to bypass Cloudflare's
# bot detection. The only way to block this approach would be a captcha placed
# in front of every yggtorrent page, per session.

import sys
import time
import urllib.parse

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait

# Exit if no search term was given on the command line.
if len(sys.argv) == 1:
    sys.exit("Please choose a film ou serie name")
arg1 = sys.argv[1]

# URL-encode the user-supplied term so spaces and accented characters
# don't break the query string.
search_url = (
    "https://www2.yggtorrent.se/engine/search?name="
    f"{urllib.parse.quote_plus(arg1)}"
    "&description=&file=&uploader=&category=all"
    "&sub_category=&do=search&order=desc&sort=seed"
)

# Load a headless Firefox through geckodriver.
# NOTE(review): executable_path is deprecated in Selenium 4 (use Service);
# kept here because the original targets Selenium 3 — confirm installed version.
options = webdriver.FirefoxOptions()
options.add_argument('-headless')
driver = webdriver.Firefox(options=options, executable_path=r'/usr/local/bin/geckodriver')

try:
    driver.get(search_url)

    # Cloudflare bounces through a challenge page; wait (up to 10 s) until
    # the browser is redirected away from the original search URL.
    print("Page atteinte, attente de redirection anti-crawling...")
    wait = WebDriverWait(driver, 10)
    wait.until(lambda d: d.current_url != search_url)

    # Give the results page a couple of seconds to finish rendering.
    print("Anti-crawling passé, affichage dans 2 secondes ...")
    time.sleep(2)

    # Collect every link in the results table, keeping only torrent detail
    # pages. find_elements_by_css_selector was removed in Selenium 4;
    # find_elements(By.CSS_SELECTOR, ...) works on both Selenium 3 and 4.
    elems = driver.find_elements(By.CSS_SELECTOR, ".results [href]")
    links = [elem.get_attribute('href') for elem in elems]
    links = [link for link in links if '/torrent/' in link]

    # Print one torrent URL per line.
    print("\n".join(links))
finally:
    # Always release the browser process, even if the Cloudflare wait
    # times out — otherwise headless Firefox instances leak.
    driver.quit()
|
|
@ -44,7 +44,7 @@ YGGTORRENT_SEARCH_URL_DO = "&do="
|
|||
YGGTORRENT_SEARCH_URL_PAGE = "&page="
|
||||
|
||||
YGGTORRENT_GET_FILES = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
|
||||
YGGTORRENT_GET_INFO = f"https://www2.yggtorrentchg/engine/get_nfo?torrent="
|
||||
YGGTORRENT_GET_INFO = f"https://www2.yggtorrent.se/engine/get_nfo?torrent="
|
||||
|
||||
YGGTORRENT_MOST_COMPLETED_URL = f"{YGGTORRENT_BASE_URL}/engine/mostcompleted"
|
||||
|
||||
|
@ -52,6 +52,7 @@ TORRENT_PER_PAGE = 50
|
|||
|
||||
YGGTORRENT_FILES_URL = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
|
||||
|
||||
headers = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
|
||||
|
||||
def set_yggtorrent_tld(yggtorrent_tld=None):
|
||||
"""
|
||||
|
@ -77,7 +78,7 @@ def set_yggtorrent_tld(yggtorrent_tld=None):
|
|||
|
||||
YGGTORRENT_SEARCH_URL = f"{YGGTORRENT_BASE_URL}/engine/search?name="
|
||||
|
||||
YGGTORRENT_DOMAIN = ".yggtorrent.gg"
|
||||
YGGTORRENT_DOMAIN = ".yggtorrent.se"
|
||||
|
||||
YGGTORRENT_GET_FILES = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
|
||||
YGGTORRENT_GET_INFO = f"https://www2.yggtorrentchg/engine/get_nfo?torrent="
|
||||
|
@ -108,7 +109,7 @@ class YggTorrentScraper:
|
|||
"User-Agent": "PostmanRuntime/7.17.1",
|
||||
"Accept": "*/*",
|
||||
"Cache-Control": "no-cache",
|
||||
"Host": f"www.yggtorrent.{YGGTORRENT_TLD}",
|
||||
"Host": f"www2.yggtorrent.{YGGTORRENT_TLD}",
|
||||
"Accept-Encoding": "gzip, deflate",
|
||||
"Connection": "keep-alive",
|
||||
}
|
||||
|
@ -145,7 +146,7 @@ class YggTorrentScraper:
|
|||
"""
|
||||
Logout request
|
||||
"""
|
||||
response = self.session.get(YGGTORRENT_LOGOUT_URL)
|
||||
response = self.session.get(YGGTORRENT_LOGOUT_URL, headers=headers)
|
||||
|
||||
self.session.cookies.clear()
|
||||
|
||||
|
@ -160,12 +161,18 @@ class YggTorrentScraper:
|
|||
|
||||
return False
|
||||
|
||||
def search(self, parameters):
|
||||
#kopa
|
||||
def search_old(self, parameters):
|
||||
search_url = create_search_url(parameters)
|
||||
torrents_url = self.get_torrents_url(search_url, parameters)
|
||||
|
||||
return torrents_url
|
||||
|
||||
def search(self, parameters):
|
||||
# torrents_url = os.popen('gecko/torrent_search.py didier')
|
||||
torrents_url = exec(open('/home/iptubes/astroport-iptubes/yggcrawl/gecko/torrent_search.py').read())
|
||||
return torrents_url
|
||||
|
||||
def extract_details(self, torrent_url):
|
||||
"""
|
||||
Extract informations from torrent's url
|
||||
|
@ -174,7 +181,7 @@ class YggTorrentScraper:
|
|||
|
||||
torrents = []
|
||||
|
||||
response = self.session.get(torrent_url)
|
||||
response = self.session.get(torrent_url, headers=headers)
|
||||
|
||||
torrent_page = BeautifulSoup(response.content, features="lxml")
|
||||
|
||||
|
@ -237,7 +244,7 @@ class YggTorrentScraper:
|
|||
"input", {"type": "hidden", "name": "target"}
|
||||
)["value"]
|
||||
|
||||
response = self.session.get(YGGTORRENT_GET_FILES + torrent_id)
|
||||
response = self.session.get(YGGTORRENT_GET_FILES + torrent_id, headers=headers)
|
||||
|
||||
files_page = BeautifulSoup(response.content, features="lxml")
|
||||
|
||||
|
@ -292,12 +299,12 @@ class YggTorrentScraper:
|
|||
|
||||
return torrents_url
|
||||
|
||||
#kopaa
|
||||
def get_torrents_url(self, search_url, parameters):
|
||||
"""
|
||||
Return
|
||||
"""
|
||||
|
||||
response = self.session.get(search_url)
|
||||
response = self.session.get(search_url, headers=headers)
|
||||
|
||||
search_page = BeautifulSoup(response.content, features="lxml")
|
||||
|
||||
|
@ -317,7 +324,7 @@ class YggTorrentScraper:
|
|||
|
||||
search_url = create_search_url(parameters)
|
||||
|
||||
response = self.session.get(search_url)
|
||||
response = self.session.get(search_url, headers=headers)
|
||||
|
||||
search_page = BeautifulSoup(response.content, features="lxml")
|
||||
|
||||
|
@ -328,7 +335,6 @@ class YggTorrentScraper:
|
|||
|
||||
return torrents
|
||||
|
||||
#kopa
|
||||
def download_from_torrent_url(self, torrent_url=None, destination_path="./data/tmp/torrents/"):
|
||||
if torrent_url is not None:
|
||||
torrent = self.extract_details(torrent_url)
|
||||
|
@ -349,7 +355,7 @@ class YggTorrentScraper:
|
|||
if torrent_url is None:
|
||||
raise Exception("Invalid torrent_url, make sure you are logged")
|
||||
|
||||
response = self.session.get(YGGTORRENT_BASE_URL + torrent_url)
|
||||
response = self.session.get(YGGTORRENT_BASE_URL + torrent_url, headers=headers)
|
||||
|
||||
temp_file_name = response.headers.get("content-disposition")
|
||||
|
||||
|
@ -368,7 +374,6 @@ class YggTorrentScraper:
|
|||
|
||||
return file_full_path
|
||||
|
||||
|
||||
def create_search_url(parameters):
|
||||
"""
|
||||
Return a formated URL for torrent's search
|
||||
|
|
Loading…
Reference in New Issue