search
menu
person

NEWS AND UPDATES


18:14
Очистка HTML с помощью html5lib
Задача — очистить текст от тегов, которые не разрешены при добавлении.
Примерное решение задачи с помощью html5lib
Code
# -*- coding:utf-8 -*-
import html5lib
from html5lib import sanitizer, treebuilders, treewalkers, serializer

DEFAULT_ALLOWED_TAGS = ('a', 'p', 'img', 'h2', 'table', 'thead', 'tbody', 'tr', 'td', 'th')

class AllowedTagsSanitizer(sanitizer.HTMLSanitizer):
    """Strict sanitizer that lets through only whitelisted tags.

    The whitelist is read from the ``_allowed_tags`` class attribute. It
    defaults to ``DEFAULT_ALLOWED_TAGS`` and can be overridden on the class
    (or on a subclass) before the sanitizer is instantiated.
    """

    # Fallback whitelist: without it, instantiating the class before anyone
    # assigned ``_allowed_tags`` raised AttributeError in ``__init__``.
    _allowed_tags = DEFAULT_ALLOWED_TAGS

    def __init__(self, *args, **kwargs):
        """Configure the whitelists, then initialise the base sanitizer.

        All positional and keyword arguments are forwarded unchanged to
        ``sanitizer.HTMLSanitizer.__init__``.
        """
        self.allowed_elements = self._allowed_tags

        # Only the attributes that make 'a' and 'img' useful are permitted,
        # and only when the corresponding tag is itself whitelisted.
        self.allowed_attributes = []
        if 'a' in self.allowed_elements:
            self.allowed_attributes.append('href')
        if 'img' in self.allowed_elements:
            self.allowed_attributes.append('src')

        # No CSS keywords or properties are whitelisted.
        self.allowed_css_keywords = []
        self.allowed_css_properties = []
        self.allowed_protocols = ['http', 'https']

        super(AllowedTagsSanitizer, self).__init__(*args, **kwargs)

        # NOTE(review): assigned after the base initialiser, presumably so
        # the base class cannot reset them -- confirm against the html5lib
        # version in use.
        self.lowercaseElementName = True
        self.lowercaseAttrName = True

    def sanitize_token(self, token):
        """Sanitize one token, discarding tags that are not whitelisted.

        Tokens without a ``'name'`` key (non-elements) and tokens whose name
        is in ``allowed_elements`` are delegated to the base implementation.
        Any other named token yields ``None``.
        """
        if 'name' in token and token['name'] not in self.allowed_elements:
            # NOTE(review): the base tokenizer is presumed to skip falsy
            # tokens, which is what removes the tag -- verify.
            return None
        return super(AllowedTagsSanitizer, self).sanitize_token(token)

def clean_html(buf, allowed_tags=DEFAULT_ALLOWED_TAGS):
    """Clean HTML of dangerous tags and content.

    Args:
        buf: HTML fragment to clean. Surrounding whitespace is stripped.
        allowed_tags: Collection of tag names that may remain in the output.

    Returns:
        The stripped input unchanged when it is empty; otherwise the result
        of serializing the sanitized fragment with the 'utf-8' encoding.
    """
    buf = buf.strip()
    if not buf:
        return buf

    # Bind the whitelist to a throwaway subclass instead of assigning it on
    # AllowedTagsSanitizer itself: the old assignment changed shared class
    # state, so one call's whitelist leaked into every later use of the class.
    scoped_sanitizer = type(
        'ScopedAllowedTagsSanitizer',
        (AllowedTagsSanitizer,),
        {'_allowed_tags': allowed_tags}
    )

    html_parser = html5lib.HTMLParser(
        tree=treebuilders.getTreeBuilder("dom"),
        tokenizer=scoped_sanitizer
    )
    dom_tree = html_parser.parseFragment(buf)

    # Walk the parsed DOM and feed the resulting token stream to the
    # serializer.
    walker = treewalkers.getTreeWalker("dom")
    stream = walker(dom_tree)

    s = serializer.htmlserializer.HTMLSerializer(
        omit_optional_tags=False,
        quote_attr_values=True
    )
    return s.render(stream, 'utf-8')

if __name__ == '__main__':
    # Demo: show a dirty fragment before and after cleaning.
    # The variable is no longer called ``str`` (that shadowed the builtin).
    # Each message is a single pre-formatted string, so the call prints the
    # same text as the old Python 2 print statement and still parses on
    # Python 3.
    sample = """
  <p style="color: red">dirty<font size=5>html</font></p>
  """
    print("\nBefore: %s" % sample)
    print("\nAfter: %s" % clean_html(sample))


источник: http://softwaremaniacs.org/forum/python/19801/
Просмотров: 1258 | Добавил: django | Рейтинг: 0.0/0
Всего комментариев: 0
dth="100%" cellspacing="1" cellpadding="2" class="commTable">
Имя *: Email:
Код *: