# -*- coding: utf-8 -*-
# vStream https://github.com/Kodi-vStream/venom-xbmc-addons
# Forked from axiom-team/astrXbian.
import re


class cParser:
    """Small regex/string helper used to scrape values out of HTML pages.

    Most entry points first normalise the page (strip line breaks and tabs,
    decode a fixed list of HTML entities) and then apply a caller-supplied
    regular expression.
    """

    # Ordered (needles, replacement) table used by __replaceSpecialCharacters.
    # Order matters: '&amp;' is decoded early, so a double-encoded entity such
    # as '&amp;eacute;' is fully decoded by the later entries.
    # NOTE(review): the entity spellings were reconstructed from their
    # replacement targets (the needles had been HTML-decoded in the copy this
    # was restored from); both named and decimal forms are accepted - confirm
    # against upstream vStream if an exact match is required.
    # Caution: the en dash and the em dash are two different characters.
    _SPECIAL_CHARACTERS = (
        (('\r', '\n', '\t'), ''),
        (('\\/',), '/'),
        (('&amp;',), '&'),
        (('&#039;', '&#39;', '&apos;'), "'"),
        (('&#8211;', '&ndash;'), '-'),
        (('&#8212;', '&mdash;'), '-'),
        (('&eacute;', '&#233;'), 'é'),
        (('&acirc;', '&#226;'), 'â'),
        (('&ecirc;', '&#234;'), 'ê'),
        (('&icirc;', '&#238;'), 'î'),
        (('&ocirc;', '&#244;'), 'ô'),
        (('&hellip;', '&#8230;'), '...'),
        (('&quot;', '&#034;', '&#34;'), '"'),
        (('&gt;', '&#62;'), '>'),
        (('&egrave;', '&#232;'), 'è'),
        (('&ccedil;', '&#231;'), 'ç'),
        (('&laquo;', '&#171;'), '<<'),
        (('&raquo;', '&#187;'), '>>'),
        (('\xc9',), 'E'),
        (('&agrave;', '&#224;'), 'à'),
        (('&lt;', '&#60;'), '<'),
        (('&rsquo;', '&#8217;'), "'"),
        (('&lsquo;', '&#8216;'), "'"),
        # Only the entity is removed; a plain space must never be stripped.
        (('&nbsp;', '&#160;'), ''),
        (('&prime;', '&#8242;'), "'"),
        (('&#697;',), "'"),
        (('&#038;', '&#38;'), '&'),
        # Literal en dash / em dash are flattened to a plain hyphen.
        (('–', '—'), '-'),
    )

    def parseSingleResult(self, sHtmlContent, sPattern):
        """Search sHtmlContent for sPattern, expecting exactly one match.

        Returns (True, match) with special characters cleaned when there is
        exactly one match, otherwise (False, list_of_matches) untouched.
        The page itself is NOT normalised before matching.
        """
        aMatches = re.compile(sPattern).findall(sHtmlContent)
        if len(aMatches) == 1:
            return True, self.__replaceSpecialCharacters(aMatches[0])
        return False, aMatches

    def __replaceSpecialCharacters(self, sString):
        """Return sString with control characters removed and the HTML
        entities listed in _SPECIAL_CHARACTERS decoded, applied in order."""
        for aNeedles, sReplacement in self._SPECIAL_CHARACTERS:
            for sNeedle in aNeedles:
                sString = sString.replace(sNeedle, sReplacement)
        return sString

    def parse(self, sHtmlContent, sPattern, iMinFoundValue=1):
        """Normalise the page, then return every case-insensitive match.

        Returns (found, matches) where found is True when at least
        iMinFoundValue matches were collected.
        """
        sHtmlContent = self.__replaceSpecialCharacters(str(sHtmlContent))
        aMatches = re.compile(sPattern, re.IGNORECASE).findall(sHtmlContent)
        return len(aMatches) >= iMinFoundValue, aMatches

    def replace(self, sPattern, sReplaceString, sValue):
        """Thin wrapper around re.sub(sPattern, sReplaceString, sValue)."""
        return re.sub(sPattern, sReplaceString, sValue)

    def escape(self, sValue):
        """Thin wrapper around re.escape(sValue)."""
        return re.escape(sValue)

    def getNumberFromString(self, sValue):
        """Return the first run of digits in sValue as a string, or the
        integer 0 when there is none.

        When sValue contains the marker '/0-9/', the digits are searched
        after that marker instead of from the start of the string.
        """
        if '/0-9/' in sValue:
            sPattern = r'/0-9.+?(\d+)'
        else:
            sPattern = r'\d+'

        aMatches = re.findall(sPattern, sValue)
        if aMatches:
            return aMatches[0]
        return 0

    def titleParse(self, sHtmlContent, sPattern):
        """Return the named groups of the LAST match of sPattern in the
        normalised content, or {'title': content} when nothing matches.

        The previous implementation read the comprehension variable after the
        comprehension; on Python 3 that name is out of scope, so the fallback
        was always returned. The last match is tracked explicitly instead.
        """
        sHtmlContent = self.__replaceSpecialCharacters(str(sHtmlContent))
        oPattern = re.compile(sPattern, re.IGNORECASE)

        oLastMatch = None
        for oLastMatch in oPattern.finditer(sHtmlContent):
            pass

        if oLastMatch is None:
            return {'title': sHtmlContent}
        return oLastMatch.groupdict()

    def abParse(self, sHtmlContent, start, end=None, startoffset=0):
        """Return the part of sHtmlContent located between two markers.

        Usage:  oParser.abParse(sHtmlContent, 'start', 'end')
        Usage2: oParser.abParse(sHtmlContent, 'start', 'end', 6)

        start       -- marker opening the extract; when it is absent the
                       whole text is returned unchanged.
        end         -- optional marker closing the extract; it is only looked
                       for after the (shifted) start position.
        startoffset -- shifts the beginning of the extract, e.g. len(start)
                       to leave the start marker out of the result.
        """
        iStart = sHtmlContent.find(start)
        if iStart == -1:  # nothing found: hand back the complete text
            return sHtmlContent

        iBegin = startoffset + iStart
        if end:
            iEnd = sHtmlContent[iBegin:].find(end)
            # Kept as '> 0' for compatibility: an end marker sitting exactly
            # at the beginning of the extract is treated like a missing one.
            if iEnd > 0:
                return sHtmlContent[iBegin:iBegin + iEnd]

        return sHtmlContent[iBegin:]