18:14 Очистка HTML с помощью html5lib |
# -*- coding:utf-8 -*-
"""Clean HTML with html5lib.

Task: strip from a text every tag that is not allowed when the text is
submitted.  This is a sample solution built on html5lib.

Source: http://softwaremaniacs.org/forum/python/19801/
"""
import html5lib
from html5lib import sanitizer, treebuilders, treewalkers, serializer

DEFAULT_ALLOWED_TAGS = ('a', 'p', 'img', 'h2', 'table', 'thead', 'tbody',
                        'tr', 'td', 'th')


class AllowedTagsSanitizer(sanitizer.HTMLSanitizer):
    """Strict sanitizer that only lets a fixed set of tags through.

    The permitted tags are read from the ``_allowed_tags`` class attribute.
    Override it in a subclass to use a different whitelist; it defaults to
    ``DEFAULT_ALLOWED_TAGS`` so the class is usable on its own.
    """

    # Class-level default so that instantiating the class directly does not
    # fail with AttributeError when nobody has configured a whitelist.
    _allowed_tags = DEFAULT_ALLOWED_TAGS

    def __init__(self, *args, **kwargs):
        """Configure the whitelists, then initialise the base sanitizer.

        All positional and keyword arguments are forwarded unchanged to
        ``sanitizer.HTMLSanitizer.__init__``.
        """
        self.allowed_elements = self._allowed_tags

        # Attributes are only whitelisted for the tags that need them.
        self.allowed_attributes = []
        if 'a' in self.allowed_elements:
            self.allowed_attributes.append('href')
        if 'img' in self.allowed_elements:
            self.allowed_attributes.append('src')

        # No CSS is allowed at all; only http/https are listed as protocols.
        self.allowed_css_keywords = []
        self.allowed_css_properties = []
        self.allowed_protocols = ['http', 'https']

        super(AllowedTagsSanitizer, self).__init__(*args, **kwargs)

        # Set after the base __init__ so these values are the ones that stick.
        self.lowercaseElementName = True
        self.lowercaseAttrName = True

    def sanitize_token(self, token):
        """Sanitize one token, discarding elements that are not whitelisted.

        Tokens without a ``'name'`` key (non-elements) and element tokens
        whose name is in ``allowed_elements`` are delegated to the base
        class.  For any other element ``None`` is returned.
        """
        if 'name' not in token or token['name'] in self.allowed_elements:
            return super(AllowedTagsSanitizer, self).sanitize_token(token)
        # Explicit "no token": the disallowed tag itself is discarded.
        # NOTE(review): relies on the base tokenizer skipping falsy tokens --
        # confirm against the installed html5lib version.
        return None


def clean_html(buf, allowed_tags=DEFAULT_ALLOWED_TAGS):
    """Clean HTML of dangerous tags and content.

    :param buf: HTML fragment to clean; surrounding whitespace is stripped.
    :param allowed_tags: names of the tags that may remain in the output.
    :returns: the stripped input itself when it is empty, otherwise the
        sanitized fragment as rendered by the html5lib serializer with the
        ``'utf-8'`` encoding.
    """
    buf = buf.strip()
    if not buf:
        return buf

    # Bind the whitelist to a throw-away subclass instead of assigning to
    # AllowedTagsSanitizer._allowed_tags: mutating the shared class would
    # leak this call's whitelist into every other user of the sanitizer.
    bound_sanitizer = type(
        'BoundAllowedTagsSanitizer',
        (AllowedTagsSanitizer,),
        {'_allowed_tags': allowed_tags},
    )

    html_parser = html5lib.HTMLParser(
        tree=treebuilders.getTreeBuilder("dom"),
        tokenizer=bound_sanitizer,
    )
    dom_tree = html_parser.parseFragment(buf)

    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    html_serializer = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values=True,
    )
    return html_serializer.render(stream, 'utf-8')


def main():
    """Print a small dirty fragment before and after cleaning."""
    dirty = """
<p style="color: red">dirty<font size=5>html</font></p>
"""
    # Single-argument print(...) yields the same output under Python 2 and
    # is also valid Python 3 syntax.
    print("\nBefore: %s" % dirty)
    print("\nAfter: %s" % clean_html(dirty))


if __name__ == '__main__':
    main()
|
Всего комментариев: 0 | |