astrXbian/.install/.kodi/addons/plugin.video.vstream/resources/lib/parser.py

83 lines
3.6 KiB
Python
Raw Normal View History

2020-12-17 21:52:17 +01:00
# -*- coding: utf-8 -*-
# vStream https://github.com/Kodi-vStream/venom-xbmc-addons
import re
class cParser:
    """Regex and string helpers for extracting data from HTML text."""

    # Ordered (search, replacement) pairs applied one after another by
    # __replaceSpecialCharacters. Order matters: '&amp;' is decoded first, so
    # a double-encoded entity such as '&amp;eacute;' is fully decoded by the
    # later '&eacute;' entry.
    # The en dash and the em dash near the end are two different characters;
    # both are mapped to a plain hyphen.
    # NOTE(review): the entity spellings for &amp;, &#039;, &#8211;, &#8212;,
    # &acirc;, &ecirc;, &icirc;, &ocirc;, &hellip;, &quot;, &gt;, &egrave; and
    # the first &eacute; were restored from their replacement characters -
    # confirm them against the upstream vStream file.
    _REPLACEMENTS = (
        ('\r', ''),
        ('\n', ''),
        ('\t', ''),
        ('\\/', '/'),
        ('&amp;', '&'),
        ('&#039;', "'"),
        ('&#8211;', '-'),
        ('&#8212;', '-'),
        ('&eacute;', 'é'),
        ('&acirc;', 'â'),
        ('&ecirc;', 'ê'),
        ('&icirc;', 'î'),
        ('&ocirc;', 'ô'),
        ('&hellip;', '...'),
        ('&quot;', '"'),
        ('&gt;', '>'),
        ('&egrave;', 'è'),
        ('&ccedil;', 'ç'),
        ('&laquo;', '<<'),
        ('&raquo;', '>>'),
        ('\xc9', 'E'),
        ('&ndash;', '-'),
        ('&agrave;', 'à'),
        ('&lt;', '<'),
        ('&rsquo;', "'"),
        ('&lsquo;', "'"),
        ('&nbsp;', ''),
        ('&#8217;', "'"),
        ('&#8230;', '...'),
        ('&#8242;', "'"),
        ('&#884;', "'"),
        ('&#038;', '&'),
        ('–', '-'),
        ('—', '-'),
    )

    def parseSingleResult(self, sHtmlContent, sPattern):
        """Search sHtmlContent for sPattern and expect exactly one match.

        Returns (True, match) with special characters cleaned when the
        pattern matches exactly once, otherwise (False, list_of_matches)
        with the matches left untouched.
        """
        aMatches = re.compile(sPattern).findall(sHtmlContent)
        if len(aMatches) != 1:
            return False, aMatches

        match = aMatches[0]
        if isinstance(match, tuple):
            # findall yields a tuple per match when the pattern has several
            # groups; clean every group instead of calling replace on a tuple.
            match = tuple(self.__replaceSpecialCharacters(group) for group in match)
        else:
            match = self.__replaceSpecialCharacters(match)
        return True, match

    def __replaceSpecialCharacters(self, sString):
        """Return sString after applying every _REPLACEMENTS pair in order."""
        for search, replacement in self._REPLACEMENTS:
            sString = sString.replace(search, replacement)
        return sString

    def parse(self, sHtmlContent, sPattern, iMinFoundValue=1):
        """Find all case-insensitive matches of sPattern in the cleaned content.

        The content is converted with str() and cleaned with
        __replaceSpecialCharacters before matching.

        Returns (True, matches) when at least iMinFoundValue matches were
        found, otherwise (False, matches).
        """
        sHtmlContent = self.__replaceSpecialCharacters(str(sHtmlContent))
        aMatches = re.compile(sPattern, re.IGNORECASE).findall(sHtmlContent)
        return len(aMatches) >= iMinFoundValue, aMatches

    def replace(self, sPattern, sReplaceString, sValue):
        """Return sValue with every match of regex sPattern replaced."""
        return re.sub(sPattern, sReplaceString, sValue)

    def escape(self, sValue):
        """Return sValue with regex special characters escaped."""
        return re.escape(sValue)

    def getNumberFromString(self, sValue):
        """Return the first run of digits in sValue as a string.

        When sValue contains the marker '/0-9/', the digits are searched
        after that marker. Returns the integer 0 when no digits are found.
        """
        if '/0-9/' in sValue:
            sPattern = r'/0-9.+?(\d+)'
        else:
            sPattern = r'\d+'

        aMatches = re.findall(sPattern, sValue)
        if aMatches:
            return aMatches[0]
        return 0

    def titleParse(self, sHtmlContent, sPattern):
        """Return the named groups of the last match of sPattern.

        The content is converted with str() and cleaned before matching, and
        matching is case-insensitive. When nothing matches, the cleaned
        content is returned as {'title': content}.
        """
        sHtmlContent = self.__replaceSpecialCharacters(str(sHtmlContent))
        oPattern = re.compile(sPattern, re.IGNORECASE)

        # Track the last match explicitly: a comprehension variable is not
        # visible after the comprehension on Python 3.
        lastMatch = None
        for lastMatch in oPattern.finditer(sHtmlContent):
            pass

        if lastMatch is None:
            return {'title': sHtmlContent}
        return lastMatch.groupdict()

    def abParse(self, sHtmlContent, start, end=None, startoffset=0):
        """Return the part of sHtmlContent located between two markers.

        Usage: oParser.abParse(sHtmlContent, 'start', 'end')
               oParser.abParse(sHtmlContent, 'start', 'end', 6)

        start       -- marker where the extract begins.
        end         -- optional marker where the extract stops; it is only
                       searched after the beginning of the extract.
        startoffset -- shifts the beginning, e.g. to leave the start marker
                       itself out of the result.

        Returns the whole text when start is not found, and everything from
        the beginning of the extract when end is missing or not found.
        """
        startIdx = sHtmlContent.find(start)
        if startIdx == -1:
            # Nothing found: hand back the complete text.
            return sHtmlContent

        begin = startoffset + startIdx
        if end:
            endIdx = sHtmlContent[begin:].find(end)
            # An end marker sitting at the very first position of the tail
            # (index 0) is treated like a missing one.
            if endIdx > 0:
                return sHtmlContent[begin:begin + endIdx]

        return sHtmlContent[begin:]