"""
    Based on Parsedom for XBMC plugins
    Copyright (C) 2010-2011 Tobias Ussing And Henrik Mosgaard Jensen

    This program is free software: you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation, either version 3 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""
import re
from collections import namedtuple

import six

DomMatch = namedtuple('DOMMatch', ['attrs', 'content'])
re_type = type(re.compile(''))

# Matches an HTML comment, including comments that span several lines.
_COMMENT_RE = re.compile('<!--.*?-->', re.DOTALL)


def __get_dom_content(html, name, match):
    """Return the text between the opening tag ``match`` and its closing tag.

    :param html: the text that contains ``match``.
    :param name: tag name to look for; replaced by the tag name found in
        ``match`` when one can be extracted.
    :param match: the literal opening tag, as returned by
        ``__get_dom_elements``.
    :return: the inner text of the element; ``''`` for a self-closing tag or
        when neither the opening nor a closing tag is found.
    """
    if match.endswith('/>'):
        return ''

    # override tag name with tag from match if possible
    tag = re.match(r'<([^\s/>]+)', match)
    if tag:
        name = tag.group(1)

    start_str = '<%s' % name
    end_str = "</%s" % name

    # NOTE(review): the span between ``end_str`` and the loop body was lost
    # in the damaged source; the four statements below are reconstructed so
    # that ``start``, ``end`` and ``pos`` are defined for the code that
    # follows -- confirm against the upstream file.
    start = html.find(match)
    end = html.find(end_str, start)
    pos = html.find(start_str, start + 1)

    # Every nested opening tag found before the current closing tag means
    # that closing tag belongs to a child, so move on to the next one.
    while pos < end and pos != -1:
        tend = html.find(end_str, end + len(end_str))
        if tend != -1:
            end = tend
        pos = html.find(start_str, pos + 1)

    if start == -1 and end == -1:
        result = ''
    elif start > -1 and end > -1:
        result = html[start + len(match):end]
    elif end > -1:
        result = html[:end]
    elif start > -1:
        result = html[start + len(match):]
    else:
        result = ''

    return result


def __get_dom_elements(item, name, attrs):
    """Return the opening tags of ``name`` in ``item`` that satisfy ``attrs``.

    :param item: text to search.
    :param name: tag name; interpolated into the search pattern as-is.
    :param attrs: dict mapping attribute name to a required value. A value
        may be a compiled regex (matched with ``re.match``), a string, or an
        iterable of strings (all must be among the space-separated tokens of
        the attribute). An empty dict matches every ``name`` tag.
    :return: list of opening-tag strings that satisfy every entry of
        ``attrs``.
    """
    flags = re.M | re.S | re.I
    if not attrs:
        pattern = r'(<%s(?:\s[^>]*>|/?>))' % name
        return re.findall(pattern, item, flags)

    last_list = None
    for key, value in six.iteritems(attrs):
        value_is_regex = isinstance(value, re_type)
        value_is_str = isinstance(value, six.string_types)

        # First try quoted attribute values: key="value" or key='value'.
        pattern = r'''(<{tag}[^>]*\s{key}=(?P<delim>['"])(.*?)(?P=delim)[^>]*>)'''.format(tag=name, key=key)
        re_list = re.findall(pattern, item, flags)
        if value_is_regex:
            this_list = [r[0] for r in re_list if re.match(value, r[2])]
        else:
            temp_value = [value] if value_is_str else value
            this_list = [r[0] for r in re_list if set(temp_value) <= set(r[2].split(' '))]

        if not this_list:
            # Fall back to unquoted values (key=value); those cannot contain
            # a space, so skip the attempt when the wanted value has one.
            has_space = (value_is_regex and ' ' in value.pattern) or (value_is_str and ' ' in value)
            if not has_space:
                pattern = r'''(<{tag}[^>]*\s{key}=((?:[^\s>]|/>)*)[^>]*>)'''.format(tag=name, key=key)
                re_list = re.findall(pattern, item, flags)
                if value_is_regex:
                    this_list = [r[0] for r in re_list if re.match(value, r[1])]
                else:
                    this_list = [r[0] for r in re_list if value == r[1]]

        if last_list is None:
            last_list = this_list
        else:
            # The loop variable must not be called ``item``: on Python 2 a
            # list comprehension leaks its variable and would overwrite the
            # text being searched for the remaining attributes.
            last_list = [elem for elem in this_list if elem in last_list]

    return last_list


def __get_attribs(element):
    """Parse the attributes of an opening tag into a dict.

    :param element: opening-tag string such as ``<a href="x" id=y>``.
    :return: dict mapping lower-cased, stripped attribute names to their
        values; both quoted and unquoted values are recognised.
    """
    attribs = {}
    pattern = r'''\s+(?P<key>[^=]+)=\s*(?:(?P<delim>["'])(?P<value1>.*?)(?P=delim)|(?P<value2>[^"'][^>\s]*))'''
    for match in re.finditer(pattern, element):
        groups = match.groupdict()
        value1 = groups.get('value1')
        value2 = groups.get('value2')
        value = value1 if value1 is not None else value2
        if value is None:
            continue
        attribs[groups['key'].lower().strip()] = value
    return attribs


#TC 2/01/19 started
def parse_dom(html, name='', attrs=None, req=False, exclude_comments=False):
    """Find ``name`` elements in ``html`` and return their attributes/content.

    :param html: a text string, a ``DomMatch``, or a list of either. On
        Python 2 a byte string is decoded as UTF-8 first.
    :param name: tag name to search for (required).
    :param attrs: optional dict of attribute filters; see
        ``__get_dom_elements`` for the accepted value types.
    :param req: attribute name, or list of names, that an element must carry
        to be included in the result.
    :param exclude_comments: when true, HTML comments are removed from each
        item before it is searched.
    :return: list of ``DomMatch(attrs, content)``; ``''`` when ``html`` has
        an unsupported type, ``name`` is empty, or ``attrs`` is not a dict.
    """
    if attrs is None:
        attrs = {}
    name = name.strip()

    if isinstance(html, (six.text_type, DomMatch)):
        html = [html]
    elif isinstance(html, six.binary_type) and six.PY2:
        try:
            html = [html.decode("utf-8")]  # Replace with chardet thingy
        except UnicodeError:
            try:
                html = [html.decode("utf-8", "replace")]
            except UnicodeError:
                html = [html]
    elif not isinstance(html, list):
        return ''

    if not name:
        return ''

    if not isinstance(attrs, dict):
        return ''

    if req:
        if not isinstance(req, list):
            req = [req]
        req = {key.lower() for key in req}

    all_results = []
    for item in html:
        if isinstance(item, DomMatch):
            item = item.content

        if exclude_comments:
            item = _COMMENT_RE.sub('', item)

        results = []
        for element in __get_dom_elements(item, name, attrs):
            attribs = __get_attribs(element)
            if req and not req <= set(attribs.keys()):
                continue
            temp = __get_dom_content(item, name, element).strip()
            results.append(DomMatch(attribs, temp))
            # Continue after this element so an identical opening tag later
            # in the text resolves to its own content.
            item = item[item.find(temp, item.find(element)):]
        all_results += results

    return all_results