astrXbian/.install/.kodi/addons/script.module.parsedom/lib/CommonFunctions.py

'''
   Parsedom for XBMC plugins
   Copyright (C) 2010-2011 Tobias Ussing And Henrik Mosgaard Jensen

   This program is free software: you can redistribute it and/or modify
   it under the terms of the GNU General Public License as published by
   the Free Software Foundation, either version 3 of the License, or
   (at your option) any later version.

   This program is distributed in the hope that it will be useful,
   but WITHOUT ANY WARRANTY; without even the implied warranty of
   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
   GNU General Public License for more details.

   You should have received a copy of the GNU General Public License
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
'''

import sys
import urllib
import urllib2
import re
import io
import inspect
import time
import HTMLParser
#import chardet
import json

version = u"2.5.1"
plugin = u"CommonFunctions-" + version
print plugin

USERAGENT = u"Mozilla/5.0 (Windows NT 6.2; Win64; x64; rv:16.0.1) Gecko/20121011 Firefox/16.0.1"

if hasattr(sys.modules["__main__"], "xbmc"):
    xbmc = sys.modules["__main__"].xbmc
else:
    import xbmc

if hasattr(sys.modules["__main__"], "xbmcgui"):
    xbmcgui = sys.modules["__main__"].xbmcgui
else:
    import xbmcgui

if hasattr(sys.modules["__main__"], "dbg"):
    dbg = sys.modules["__main__"].dbg
else:
    dbg = False

if hasattr(sys.modules["__main__"], "dbglevel"):
    dbglevel = sys.modules["__main__"].dbglevel
else:
    dbglevel = 3

if hasattr(sys.modules["__main__"], "opener"):
    urllib2.install_opener(sys.modules["__main__"].opener)


# This function raises a keyboard for user input
def getUserInput(title=u"Input", default=u"", hidden=False):
    log("", 5)
    result = None

    # Fix for when this functions is called with default=None
    if not default:
        default = u""

    keyboard = xbmc.Keyboard(default, title)
    keyboard.setHiddenInput(hidden)
    keyboard.doModal()

    if keyboard.isConfirmed():
        result = keyboard.getText()

    log(repr(result), 5)
    return result


# This function raises a keyboard numpad for user input
def getUserInputNumbers(title=u"Input", default=u""):
    log("", 5)
    result = None

    # Fix for when this functions is called with default=None
    if not default:
        default = u""

    keyboard = xbmcgui.Dialog()
    result = keyboard.numeric(0, title, default)

    log(repr(result), 5)
    return str(result)


def getXBMCVersion():
    log("", 3)
    version = xbmc.getInfoLabel( "System.BuildVersion" )
    log(version, 3)
    for key in ["-", " "]:
        if version.find(key) -1:
            version = version[:version.find(key)]
    version = float(version)
    log(repr(version))
    return version

# Converts the request url passed on by xbmc to the plugin into a dict of key-value pairs
def getParameters(parameterString):
    log("", 5)
    commands = {}
    if getXBMCVersion() >= 12.0:
        parameterString = urllib.unquote_plus(parameterString)
    splitCommands = parameterString[parameterString.find('?') + 1:].split('&')

    for command in splitCommands:
        if (len(command) > 0):
            splitCommand = command.split('=')
            key = splitCommand[0]
            try:
                value = splitCommand[1].encode("utf-8")
            except:
                log("Error utf-8 encoding argument value: " + repr(splitCommand[1]))
                value = splitCommand[1]

            commands[key] = value

    log(repr(commands), 5)
    return commands


def replaceHTMLCodes(txt):
    log(repr(txt), 5)

    # Fix missing ; in &#<number>;
    txt = re.sub("(&#[0-9]+)([^;^0-9]+)", "\\1;\\2", makeUTF8(txt))

    txt = HTMLParser.HTMLParser().unescape(txt)
    txt = txt.replace("&amp;", "&")
    log(repr(txt), 5)
    return txt


def stripTags(html):
    log(repr(html), 5)
    sub_start = html.find("<")
    sub_end = html.find(">")
    while sub_start < sub_end and sub_start > -1:
        html = html.replace(html[sub_start:sub_end + 1], "").strip()
        sub_start = html.find("<")
        sub_end = html.find(">")

    log(repr(html), 5)
    return html


def _getDOMContent(html, name, match, ret):  # Cleanup
    log("match: " + match, 3)

    endstr = u"</" + name  # + ">"

    start = html.find(match)
    end = html.find(endstr, start)
    pos = html.find("<" + name, start + 1 )

    log(str(start) + " < " + str(end) + ", pos = " + str(pos) + ", endpos: " + str(end), 8)

    while pos < end and pos != -1:  # Ignore too early </endstr> return
        tend = html.find(endstr, end + len(endstr))
        if tend != -1:
            end = tend
        pos = html.find("<" + name, pos + 1)
        log("loop: " + str(start) + " < " + str(end) + " pos = " + str(pos), 8)

    log("start: %s, len: %s, end: %s" % (start, len(match), end), 3)
    if start == -1 and end == -1:
        result = u""
    elif start > -1 and end > -1:
        result = html[start + len(match):end]
    elif end > -1:
        result = html[:end]
    elif start > -1:
        result = html[start + len(match):]

    if ret:
        endstr = html[end:html.find(">", html.find(endstr)) + 1]
        result = match + result + endstr

    log("done result length: " + str(len(result)), 3)
    return result

def _getDOMAttributes(match, name, ret):
    log("", 3)

    lst = re.compile('<' + name + '.*?' + ret + '=([\'"].[^>]*?[\'"])>', re.M | re.S).findall(match)
    if len(lst) == 0:
        lst = re.compile('<' + name + '.*?' + ret + '=(.[^>]*?)>', re.M | re.S).findall(match)
    ret = []
    for tmp in lst:
        cont_char = tmp[0]
        if cont_char in "'\"":
            log("Using %s as quotation mark" % cont_char, 3)

            # Limit down to next variable.
            if tmp.find('=' + cont_char, tmp.find(cont_char, 1)) > -1:
                tmp = tmp[:tmp.find('=' + cont_char, tmp.find(cont_char, 1))]

            # Limit to the last quotation mark
            if tmp.rfind(cont_char, 1) > -1:
                tmp = tmp[1:tmp.rfind(cont_char)]
        else:
            log("No quotation mark found", 3)
            if tmp.find(" ") > 0:
                tmp = tmp[:tmp.find(" ")]
            elif tmp.find("/") > 0:
                tmp = tmp[:tmp.find("/")]
            elif tmp.find(">") > 0:
                tmp = tmp[:tmp.find(">")]

        ret.append(tmp.strip())

    log("Done: " + repr(ret), 3)
    return ret

def _getDOMElements(item, name, attrs):
    log("", 3)

    lst = []
    for key in attrs:
        lst2 = re.compile('(<' + name + '[^>]*?(?:' + key + '=[\'"]' + attrs[key] + '[\'"].*?>))', re.M | re.S).findall(item)
        if len(lst2) == 0 and attrs[key].find(" ") == -1:  # Try matching without quotation marks
            lst2 = re.compile('(<' + name + '[^>]*?(?:' + key + '=' + attrs[key] + '.*?>))', re.M | re.S).findall(item)

        if len(lst) == 0:
            log("Setting main list " + repr(lst2), 5)
            lst = lst2
            lst2 = []
        else:
            log("Setting new list " + repr(lst2), 5)
            test = range(len(lst))
            test.reverse()
            for i in test:  # Delete anything missing from the next list.
                if not lst[i] in lst2:
                    log("Purging mismatch " + str(len(lst)) + " - " + repr(lst[i]), 3)
                    del(lst[i])

    if len(lst) == 0 and attrs == {}:
        log("No list found, trying to match on name only", 3)
        lst = re.compile('(<' + name + '>)', re.M | re.S).findall(item)
        if len(lst) == 0:
            lst = re.compile('(<' + name + ' .*?>)', re.M | re.S).findall(item)

    log("Done: " + str(type(lst)), 3)
    return lst

def parseDOM(html, name=u"", attrs={}, ret=False):
    log("Name: " + repr(name) + " - Attrs:" + repr(attrs) + " - Ret: " + repr(ret) + " - HTML: " + str(type(html)), 3)

    if isinstance(name, str): # Should be handled
        try:
            name = name #.decode("utf-8")
        except:
            log("Couldn't decode name binary string: " + repr(name))

    if isinstance(html, str):
        try:
            html = [html.decode("utf-8")] # Replace with chardet thingy
        except:
            log("Couldn't decode html binary string. Data length: " + repr(len(html)))
            html = [html]
    elif isinstance(html, unicode):
        html = [html]
    elif not isinstance(html, list):
        log("Input isn't list or string/unicode.")
        return u""

    if not name.strip():
        log("Missing tag name")
        return u""

    ret_lst = []
    for item in html:
        temp_item = re.compile('(<[^>]*?\n[^>]*?>)').findall(item)
        for match in temp_item:
            item = item.replace(match, match.replace("\n", " "))

        lst = _getDOMElements(item, name, attrs)

        if isinstance(ret, str):
            log("Getting attribute %s content for %s matches " % (ret, len(lst) ), 3)
            lst2 = []
            for match in lst:
                lst2 += _getDOMAttributes(match, name, ret)
            lst = lst2
        else:
            log("Getting element content for %s matches " % len(lst), 3)
            lst2 = []
            for match in lst:
                log("Getting element content for %s" % match, 4)
                temp = _getDOMContent(item, name, match, ret).strip()
                item = item[item.find(temp, item.find(match)) + len(temp):]
                lst2.append(temp)
            lst = lst2
        ret_lst += lst

    log("Done: " + repr(ret_lst), 3)
    return ret_lst


def extractJS(data, function=False, variable=False, match=False, evaluate=False, values=False):
    log("")
    scripts = parseDOM(data, "script")
    if len(scripts) == 0:
        log("Couldn't find any script tags. Assuming javascript file was given.")
        scripts = [data]

    lst = []
    log("Extracting", 4)
    for script in scripts:
        tmp_lst = []
        if function:
            tmp_lst = re.compile(function + '\(.*?\).*?;', re.M | re.S).findall(script)
        elif variable:
            tmp_lst = re.compile(variable + '[ ]+=.*?;', re.M | re.S).findall(script)
        else:
            tmp_lst = [script]
        if len(tmp_lst) > 0:
            log("Found: " + repr(tmp_lst), 4)
            lst += tmp_lst
        else:
            log("Found nothing on: " + script, 4)

    test = range(0, len(lst))
    test.reverse()
    for i in test:
        if match and lst[i].find(match) == -1:
            log("Removing item: " + repr(lst[i]), 10)
            del lst[i]
        else:
            log("Cleaning item: " + repr(lst[i]), 4)
            if lst[i][0] == u"\n":
                lst[i] == lst[i][1:]
            if lst[i][len(lst) -1] == u"\n":
                lst[i] == lst[i][:len(lst)- 2]
            lst[i] = lst[i].strip()

    if values or evaluate:
        for i in range(0, len(lst)):
            log("Getting values %s" % lst[i])
            if function:
                if evaluate: # include the ( ) for evaluation
                    data = re.compile("(\(.*?\))", re.M | re.S).findall(lst[i])
                else:
                    data = re.compile("\((.*?)\)", re.M | re.S).findall(lst[i])
            elif variable:
                tlst = re.compile(variable +".*?=.*?;", re.M | re.S).findall(lst[i])
                data = []
                for tmp in tlst: # This breaks for some stuff. "ad_tag": "http://ad-emea.doubleclick.net/N4061/pfadx/com.ytpwatch.entertainment/main_563326'' # ends early, must end with }
                    cont_char = tmp[0]
                    cont_char = tmp[tmp.find("=") + 1:].strip()
                    cont_char = cont_char[0]
                    if cont_char in "'\"":
                        log("Using %s as quotation mark" % cont_char, 1)
                        tmp = tmp[tmp.find(cont_char) + 1:tmp.rfind(cont_char)]
                    else:
                        log("No quotation mark found", 1)
                        tmp = tmp[tmp.find("=") + 1: tmp.rfind(";")]

                    tmp = tmp.strip()
                    if len(tmp) > 0:
                        data.append(tmp)
            else:
                log("ERROR: Don't know what to extract values from")

            log("Values extracted: %s" % repr(data))
            if len(data) > 0:
                lst[i] = data[0]

    if evaluate:
        for i in range(0, len(lst)):
            log("Evaluating %s" % lst[i])
            data = lst[i].strip()
            try:
                try:
                    lst[i] = json.loads(data)
                except:
                    log("Couldn't json.loads, trying eval")
                    lst[i] = eval(data)
            except:
                log("Couldn't eval: %s from %s" % (repr(data), repr(lst[i])))

    log("Done: " + str(len(lst)))
    return lst

def fetchPage(params={}):
    get = params.get
    link = get("link")
    ret_obj = {}
    if get("post_data"):
        log("called for : " + repr(params['link']))
    else:
        log("called for : " + repr(params))

    if not link or int(get("error", "0")) > 2:
        log("giving up")
        ret_obj["status"] = 500
        return ret_obj

    if get("post_data"):
        if get("hide_post_data"):
            log("Posting data", 2)
        else:
            log("Posting data: " + urllib.urlencode(get("post_data")), 2)

        request = urllib2.Request(link, urllib.urlencode(get("post_data")))
        request.add_header('Content-Type', 'application/x-www-form-urlencoded')
    else:
        log("Got request", 2)
        request = urllib2.Request(link)

    if get("headers"):
        for head in get("headers"):
            request.add_header(head[0], head[1])

    request.add_header('User-Agent', USERAGENT)

    if get("cookie"):
        request.add_header('Cookie', get("cookie"))

    if get("refering"):
        request.add_header('Referer', get("refering"))

    try:
        log("connecting to server...", 1)

        con = urllib2.urlopen(request)
        ret_obj["header"] = con.info()
        ret_obj["new_url"] = con.geturl()
        if get("no-content", "false") == u"false" or get("no-content", "false") == "false":
            inputdata = con.read()
            #data_type = chardet.detect(inputdata)
            #inputdata = inputdata.decode(data_type["encoding"])
            ret_obj["content"] = inputdata.decode("utf-8")

        con.close()

        log("Done")
        ret_obj["status"] = 200
        return ret_obj

    except urllib2.HTTPError, e:
        err = str(e)
        log("HTTPError : " + err)
        log("HTTPError - Headers: " + str(e.headers) + " - Content: " + e.fp.read())

        params["error"] = str(int(get("error", "0")) + 1)
        ret = fetchPage(params)

        if not "content" in ret and e.fp:
            ret["content"] = e.fp.read()
            return ret

        ret_obj["status"] = 500
        return ret_obj

    except urllib2.URLError, e:
        err = str(e)
        log("URLError : " + err)

        time.sleep(3)
        params["error"] = str(int(get("error", "0")) + 1)
        ret_obj = fetchPage(params)
        return ret_obj


def getCookieInfoAsHTML():
    log("", 5)
    if hasattr(sys.modules["__main__"], "cookiejar"):
        cookiejar = sys.modules["__main__"].cookiejar

        cookie = repr(cookiejar)
        cookie = cookie.replace("<_LWPCookieJar.LWPCookieJar[", "")
        cookie = cookie.replace("), Cookie(version=0,", "></cookie><cookie ")
        cookie = cookie.replace(")]>", "></cookie>")
        cookie = cookie.replace("Cookie(version=0,", "<cookie ")
        cookie = cookie.replace(", ", " ")
        log(repr(cookie), 5)
        return cookie

    log("Found no cookie", 5)
    return ""


# This function implements a horrible hack related to python 2.4's terrible unicode handling.
def makeAscii(data):
    log(repr(data), 5)
    #if sys.hexversion >= 0x02050000:
    #        return data

    try:
        return data.encode('ascii', "ignore")
    except:
        log("Hit except on : " + repr(data))
        s = u""
        for i in data:
            try:
                i.encode("ascii", "ignore")
            except:
                log("Can't convert character", 4)
                continue
            else:
                s += i

        log(repr(s), 5)
        return s


# This function handles stupid utf handling in python.
def makeUTF8(data):
    log(repr(data), 5)
    return data
    try:
        return data.decode('utf8', 'xmlcharrefreplace') # was 'ignore'
    except:
        log("Hit except on : " + repr(data))
        s = u""
        for i in data:
            try:
                i.decode("utf8", "xmlcharrefreplace")
            except:
                log("Can't convert character", 4)
                continue
            else:
                s += i
        log(repr(s), 5)
        return s


def openFile(filepath, options=u"r"):
    log(repr(filepath) + " - " + repr(options))
    if options.find("b") == -1:  # Toggle binary mode on failure
        alternate = options + u"b"
    else:
        alternate = options.replace(u"b", u"")

    try:
        log("Trying normal: %s" % options)
        return io.open(filepath, options)
    except:
        log("Fallback to binary: %s" % alternate)
        return io.open(filepath, alternate)


def log(description, level=0):
    if dbg and dbglevel > level:
        try:
            xbmc.log((u"[%s] %s : '%s'" % (plugin, inspect.stack()[1][3], description)).decode("utf-8"), xbmc.LOGNOTICE)
        except:
            xbmc.log(u"FALLBACK [%s] %s : '%s'" % (plugin, inspect.stack()[1][3], repr(description)), xbmc.LOGNOTICE)