# astroport-iptubes/yggcrawl/yggtorrentscraper.py

import datetime
import logging
import os
import re

import requests
from bs4 import BeautifulSoup

from .torrent import Torrent, TorrentComment, TorrentFile
from .categories import categories

# Top level domain currently used to reach the site; every URL below is
# derived from it.
YGGTORRENT_TLD = "se"

YGGTORRENT_BASE_URL = f"https://www2.yggtorrent.{YGGTORRENT_TLD}"

YGGTORRENT_LOGIN_URL = f"{YGGTORRENT_BASE_URL}/user/login"
YGGTORRENT_LOGOUT_URL = f"{YGGTORRENT_BASE_URL}/user/logout?attempt=1"

# Search endpoint; the searched name is appended right after "name=".
YGGTORRENT_SEARCH_URL = f"{YGGTORRENT_BASE_URL}/engine/search?name="

logger = logging.getLogger("yggtorrentscraper")

# Domain and name of the cookie that carries the session token.
YGGTORRENT_DOMAIN = f".yggtorrent.{YGGTORRENT_TLD}"
YGGTORRENT_TOKEN_COOKIE = "ygg_"

# Query-string fragments appended to YGGTORRENT_SEARCH_URL.
# (They were previously declared twice with identical values.)
YGGTORRENT_SEARCH_URL_DESCRIPTION = "&description="
YGGTORRENT_SEARCH_URL_FILE = "&file="
YGGTORRENT_SEARCH_URL_UPLOADER = "&uploader="
YGGTORRENT_SEARCH_URL_CATEGORY = "&category="
YGGTORRENT_SEARCH_URL_SUB_CATEGORY = "&sub_category="
YGGTORRENT_SEARCH_URL_ORDER = "&order="
YGGTORRENT_SEARCH_URL_SORT = "&sort="
YGGTORRENT_SEARCH_URL_DO = "&do="
YGGTORRENT_SEARCH_URL_PAGE = "&page="

YGGTORRENT_GET_FILES = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
# Built from the base URL like every other endpoint; the former literal
# pointed at "www2.yggtorrentchg", which is not a resolvable host name.
YGGTORRENT_GET_INFO = f"{YGGTORRENT_BASE_URL}/engine/get_nfo?torrent="

YGGTORRENT_MOST_COMPLETED_URL = f"{YGGTORRENT_BASE_URL}/engine/mostcompleted"

# Offset step used for the "page" search parameter.
TORRENT_PER_PAGE = 50

# Same endpoint as YGGTORRENT_GET_FILES, kept under its second name.
YGGTORRENT_FILES_URL = YGGTORRENT_GET_FILES


def set_yggtorrent_tld(yggtorrent_tld=None):
    """
    Redefine all string variable according to new TLD

    Every module-level URL and the cookie domain are rebuilt from the given
    top level domain. When yggtorrent_tld is None the current TLD is kept and
    the derived values are simply rebuilt from it.
    """

    global YGGTORRENT_TLD
    global YGGTORRENT_BASE_URL
    global YGGTORRENT_LOGIN_URL
    global YGGTORRENT_LOGOUT_URL
    global YGGTORRENT_SEARCH_URL
    global YGGTORRENT_DOMAIN
    global YGGTORRENT_GET_FILES
    global YGGTORRENT_GET_INFO
    global YGGTORRENT_MOST_COMPLETED_URL
    global YGGTORRENT_FILES_URL

    # Without this guard the default argument would produce
    # "yggtorrent.None" URLs.
    if yggtorrent_tld is not None:
        YGGTORRENT_TLD = yggtorrent_tld

    YGGTORRENT_BASE_URL = f"https://www2.yggtorrent.{YGGTORRENT_TLD}"

    YGGTORRENT_LOGIN_URL = f"{YGGTORRENT_BASE_URL}/user/login"
    # Same shape as the module-level definition, including "?attempt=1".
    YGGTORRENT_LOGOUT_URL = f"{YGGTORRENT_BASE_URL}/user/logout?attempt=1"

    YGGTORRENT_SEARCH_URL = f"{YGGTORRENT_BASE_URL}/engine/search?name="

    # The cookie domain has to follow the TLD, otherwise the token cookie
    # set by login() is attached to the wrong domain.
    YGGTORRENT_DOMAIN = f".yggtorrent.{YGGTORRENT_TLD}"

    YGGTORRENT_GET_FILES = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="
    YGGTORRENT_GET_INFO = f"{YGGTORRENT_BASE_URL}/engine/get_nfo?torrent="

    YGGTORRENT_MOST_COMPLETED_URL = f"{YGGTORRENT_BASE_URL}/engine/mostcompleted"

    YGGTORRENT_FILES_URL = f"{YGGTORRENT_BASE_URL}/engine/get_files?torrent="


def get_yggtorrent_tld():
    """
    Return the TLD currently stored in the module-level YGGTORRENT_TLD
    """
    current_tld = YGGTORRENT_TLD

    return current_tld


class YggTorrentScraper:
    """
    Scraper for the YggTorrent pages, working through a caller-supplied session

    The session is used through its get(), post() and cookies members
    (presumably a requests.Session; verify against callers).
    """

    session = None

    def __init__(self, session):
        """
        Keep the session used for every request made by the scraper
        """
        self.session = session

    def login(self, identifiant, password):
        """
        Login request with the specified identifiant and password

        The session cookies are cleared first. When the site answers 200 and
        provides the token cookie, that cookie (necessary to download) is
        stored in the session for YGGTORRENT_DOMAIN.

        Return True when the login succeeded, False otherwise.
        """
        self.session.cookies.clear()

        headers = {
            "Content-Type": "application/x-www-form-urlencoded",
            "User-Agent": "PostmanRuntime/7.17.1",
            "Accept": "*/*",
            "Cache-Control": "no-cache",
            # NOTE(review): the Host header says "www" while the request goes
            # to the "www2" base URL - confirm this is intended.
            "Host": f"www.yggtorrent.{YGGTORRENT_TLD}",
            "Accept-Encoding": "gzip, deflate",
            "Connection": "keep-alive",
        }

        response = self.session.post(
            YGGTORRENT_LOGIN_URL,
            data={"id": identifiant, "pass": password},
            headers=headers,
        )

        logger.debug("status_code : %s", response.status_code)

        if response.status_code != 200:
            logger.debug("Login failed")

            return False

        yggtorrent_token = response.cookies.get_dict().get(YGGTORRENT_TOKEN_COOKIE)

        if yggtorrent_token is None:
            # A 200 answer without the token cookie gives nothing to store,
            # so report a failure instead of raising KeyError.
            logger.debug("Login failed")

            return False

        logger.debug("Login successful")

        cookie = requests.cookies.create_cookie(
            domain=YGGTORRENT_DOMAIN,
            name=YGGTORRENT_TOKEN_COOKIE,
            value=yggtorrent_token,
        )

        self.session.cookies.set_cookie(cookie)

        return True

    def logout(self):
        """
        Logout request

        The session cookies are cleared whatever the answer is. Return True
        when the site answered 200, False otherwise.
        """
        response = self.session.get(YGGTORRENT_LOGOUT_URL)

        self.session.cookies.clear()

        logger.debug("status_code : %s", response.status_code)

        if response.status_code == 200:
            logger.debug("Logout successful")

            return True

        logger.debug("Logout failed")

        return False

    def search(self, parameters):
        """
        Return the torrents url matching the search parameters
        """
        search_url = create_search_url(parameters)

        return self.get_torrents_url(search_url, parameters)

    def extract_details(self, torrent_url):
        """
        Extract informations from torrent's url

        The torrent page is fetched and parsed, then the file listing is
        fetched through YGGTORRENT_GET_FILES. Return a Torrent filled with
        keywords, download url, seeders/leechers/completed counters, name,
        size, uploader, upload date, comments and files.
        """
        logger.debug("torrent_url : %s", torrent_url)

        response = self.session.get(torrent_url)

        torrent_page = BeautifulSoup(response.content, features="lxml")

        torrent = Torrent()

        for term_tag in torrent_page.find_all("a", {"class": "term"}):
            torrent.keywords.append(term_tag.text)

        connection_tags = torrent_page.find("tr", {"id": "adv_search_cat"}).find_all(
            "strong"
        )

        informations_tag = (
            torrent_page.find("table", {"class": "informations"})
            .find("tbody")
            .find_all("tr")
        )

        download_button = torrent_page.find("a", {"class": "butt"})

        # Without a download link torrent.url keeps the value given by
        # Torrent(); the page may not show the button at all.
        if download_button is not None and download_button.has_attr("href"):
            torrent.url = download_button["href"]

        torrent.seeders = int(connection_tags[0].text.replace(" ", ""))
        torrent.leechers = int(connection_tags[1].text.replace(" ", ""))
        torrent.completed = int(connection_tags[2].text.replace(" ", ""))

        torrent.name = informations_tag[0].find_all("td")[1].text
        torrent.size = informations_tag[3].find_all("td")[1].text
        torrent.uploader = informations_tag[5].find_all("td")[1].text

        # The cell holds more text than the date itself, hence the search.
        mydatetime = re.search(
            r"([0-9]*/[0-9]*/[0-9]* [0-9]*:[0-9]*)",
            informations_tag[6].find_all("td")[1].text,
        ).group(0)

        torrent.uploaded_datetime = datetime.datetime.strptime(
            mydatetime, "%d/%m/%Y %H:%M"
        )

        for message_tag in torrent_page.find_all("div", {"class": "message"}):
            torrent_comment = TorrentComment()

            torrent_comment.author = message_tag.find("a").text
            torrent_comment.posted = message_tag.find("strong").text
            torrent_comment.text = message_tag.find(
                "span", {"id": "comment_text"}
            ).text.strip()

            torrent.comments.append(torrent_comment)

        torrent_id = torrent_page.find("form", {"id": "report-torrent"}).find(
            "input", {"type": "hidden", "name": "target"}
        )["value"]

        response = self.session.get(YGGTORRENT_GET_FILES + torrent_id)

        files_page = BeautifulSoup(response.content, features="lxml")

        for file_tag in files_page.find_all("tr"):
            td_tags = file_tag.find_all("td")

            # A row without a size cell and a name cell describes no file.
            if len(td_tags) < 2:
                continue

            torrent_file = TorrentFile()

            # The replaced sequences are literal backslash escapes left in
            # the text, not control characters.
            torrent_file.file_size = (
                td_tags[0]
                .text.replace("\\r", "")
                .replace("\\n", "")
                .replace("\\t", "")
                .strip()
            )
            torrent_file.file_name = (
                td_tags[1]
                .text.replace("\\r", "")
                .replace("\\n", "")
                .replace("\\t", "")
                .replace("\\", "")
                .replace(" ", "")
                .strip()
            )

            torrent.files.append(torrent_file)

        return torrent

    def most_completed(self):
        """
        Return the most completed torrents url (TOP 100)
        """

        header = {"Accept": "application/json, text/javascript, */*; q=0.01"}

        # NOTE(review): the endpoint is posted twice and the first answer is
        # discarded; kept as is - confirm whether the first call is required.
        self.session.post(YGGTORRENT_MOST_COMPLETED_URL, headers=header)

        json_response = self.session.post(
            YGGTORRENT_MOST_COMPLETED_URL, headers=header
        ).json()

        torrents_url = []

        for json_item in json_response:
            # The second element of each item is an HTML fragment holding
            # the link to the torrent page.
            root = BeautifulSoup(json_item[1], features="lxml")

            a_tag = root.find("a")

            torrents_url.append(a_tag["href"])

        return torrents_url

    def get_torrents_url(self, search_url, parameters):
        """
        Return the url of every torrent found by the search, all pages included

        search_url is fetched once to read the pagination; each result page
        is then fetched with a url rebuilt from parameters and a "page"
        offset. The parameters given by the caller are left untouched.
        """

        response = self.session.get(search_url)

        search_page = BeautifulSoup(response.content, features="lxml")

        pagination = search_page.find("ul", {"class": "pagination"})

        limit_page = 1

        if pagination is not None:
            pagination_item = pagination.find_all("a")

            # The last link of the pagination carries the number of pages.
            if pagination_item:
                limit_page = int(pagination_item[-1]["data-ci-pagination-page"])

        # Work on a copy so the caller's dict does not receive a "page" key.
        page_parameters = dict(parameters)

        torrents = []

        for page in range(limit_page):
            page_parameters["page"] = page * TORRENT_PER_PAGE

            response = self.session.get(create_search_url(page_parameters))

            search_page = BeautifulSoup(response.content, features="lxml")

            for torrent_tag in search_page.find_all("a", {"id": "torrent_name"}):
                torrents.append(torrent_tag["href"])

        return torrents

    def download_from_torrent_url(self, torrent_url=None, destination_path="./data/tmp/torrents/"):
        """
        Download the .torrent file of the torrent described at torrent_url

        Return the path of the written file, or None when torrent_url is None.
        """
        if torrent_url is None:
            return None

        torrent = self.extract_details(torrent_url)

        return self.download_from_torrent_download_url(
            torrent_url=torrent.url, destination_path=destination_path
        )

    def download_from_torrent(self, torrent=None, destination_path="./data/tmp/torrents/"):
        """
        Download the .torrent file of an already extracted torrent

        Return the path of the written file, or None when torrent is None.
        """
        if torrent is None:
            return None

        return self.download_from_torrent_download_url(
            torrent_url=torrent.url, destination_path=destination_path
        )

    def download_from_torrent_download_url(
        self, torrent_url=None, destination_path="./data/tmp/torrents/"
    ):
        """
        Download the file served at YGGTORRENT_BASE_URL + torrent_url

        The file is written in destination_path (created when missing) under
        the name announced by the content-disposition header. Return the
        path of the written file.

        Raise Exception when torrent_url is None or when the answer does not
        announce a file name.
        """
        if torrent_url is None:
            raise Exception("Invalid torrent_url, make sure you are logged")

        response = self.session.get(YGGTORRENT_BASE_URL + torrent_url)

        file_name = self._file_name_from_content_disposition(
            response.headers.get("content-disposition")
        )

        os.makedirs(destination_path, exist_ok=True)

        file_full_path = os.path.join(destination_path, file_name)

        with open(file_full_path, "wb") as torrent_file:
            torrent_file.write(response.content)

        return file_full_path

    @staticmethod
    def _file_name_from_content_disposition(content_disposition):
        """
        Return the bare file name announced by a content-disposition header

        Quoted and unquoted "filename=" values are accepted. Any directory
        part is dropped so the name cannot point outside the destination.

        Raise Exception when no usable file name is present.
        """
        marker = "filename="

        if not content_disposition or marker not in content_disposition:
            raise Exception(
                "No file name in the download response, make sure you are logged"
            )

        value = content_disposition.split(marker, 1)[1].strip()

        if value.startswith('"'):
            closing = value.find('"', 1)
            value = value[1:closing] if closing != -1 else value[1:]
        else:
            value = value.split(";", 1)[0].strip()

        # The name comes from the server: keep its last component only.
        file_name = os.path.basename(value.replace("\\", "/"))

        if not file_name:
            raise Exception(
                "No file name in the download response, make sure you are logged"
            )

        return file_name


def _matching_value_fragments(option, searched_values):
    """
    Yield one "&option_..." fragment per searched value found in the option's values
    """
    for searched_value in searched_values:
        for position, known_value in enumerate(option["values"], start=1):
            if searched_value != known_value:
                continue

            fragment = "&option_"
            fragment += option["name"]

            if "multiple" in option:
                fragment += "%3Amultiple"

            # The url carries the 1-based position of the value in the list.
            yield fragment + "[]=" + str(position)


def _option_fragments(subcategory, requested_options):
    """
    Yield the fragments of every requested option known by the subcategory
    """
    for requested_name, searched_values in requested_options.items():
        for option in subcategory["options"]:
            if requested_name == option["name"]:
                yield from _matching_value_fragments(option, searched_values)


def create_search_url(parameters):
    """
    Return a formated URL for torrent's search

    The URL starts with YGGTORRENT_SEARCH_URL and receives, in this order and
    only for the keys present in parameters: name, page, descriptions, files,
    uploader, sort, order, category, subcategory and options. Category,
    subcategory and options are translated through the categories table and
    ignored when they match nothing. The URL always ends with "&do=search".
    """

    url = YGGTORRENT_SEARCH_URL

    if "name" in parameters:
        url += parameters["name"].replace(" ", "+")

    if "page" in parameters:
        url += YGGTORRENT_SEARCH_URL_PAGE + str(parameters["page"])

    # List-valued criteria: every term is followed by "+", the last included.
    for key, prefix in (
        ("descriptions", YGGTORRENT_SEARCH_URL_DESCRIPTION),
        ("files", YGGTORRENT_SEARCH_URL_FILE),
    ):
        if key in parameters:
            url += prefix
            url += "".join(term + "+" for term in parameters[key])

    # Single-valued criteria, appended as they are.
    for key, prefix in (
        ("uploader", YGGTORRENT_SEARCH_URL_UPLOADER),
        ("sort", YGGTORRENT_SEARCH_URL_SORT),
        ("order", YGGTORRENT_SEARCH_URL_ORDER),
    ):
        if key in parameters:
            url += prefix + parameters[key]

    if "category" in parameters:
        for category in categories:
            if parameters["category"] != category["name"]:
                continue

            url += YGGTORRENT_SEARCH_URL_CATEGORY + category["id"]

            if "subcategory" not in parameters:
                continue

            for subcategory in category["subcategories"]:
                if parameters["subcategory"] != subcategory["name"]:
                    continue

                url += YGGTORRENT_SEARCH_URL_SUB_CATEGORY + subcategory["id"]

                if "options" in parameters:
                    url += "".join(
                        _option_fragments(subcategory, parameters["options"])
                    )

    return url + YGGTORRENT_SEARCH_URL_DO + "search"