# -*- coding: utf-8 -*-
# Part of Odoo. See LICENSE file for full copyright and licensing details.
import base64
import collections
import logging
import random
import re
import socket
import threading
import time
from email.utils import getaddresses
from urllib.parse import urlparse
import html as htmllib
import idna
import markupsafe
from lxml import etree, html
from lxml.html import clean
from werkzeug import urls
import odoo
from odoo.loglevels import ustr
from odoo.tools import misc
_logger = logging.getLogger(__name__)
#----------------------------------------------------------
# HTML Sanitizer
#----------------------------------------------------------
# Attribute whitelist: lxml's default safe attributes extended with `style`
# and the `data-*` attributes enumerated below.
safe_attrs = clean.defs.safe_attrs.union((
    'style',
    # quote detection
    'data-o-mail-quote',
    'data-o-mail-quote-node',
    'data-oe-model',
    'data-oe-id',
    'data-oe-field',
    'data-oe-type',
    'data-oe-expression',
    'data-oe-translation-initial-sha',
    'data-oe-nodeid',
    'data-last-history-steps',
    'data-oe-protected',
    'data-oe-transient-content',
    'data-publish',
    'data-id',
    'data-res_id',
    'data-interval',
    'data-member_id',
    'data-scroll-background-ratio',
    'data-view-id',
    'data-class',
    'data-mimetype',
    'data-original-src',
    'data-original-id',
    'data-gl-filter',
    'data-quality',
    'data-resize-width',
    'data-shape',
    'data-shape-colors',
    'data-file-name',
    'data-original-mimetype',
    # knowledge commands
    'data-behavior-props',
    'data-prop-name',
    'data-mimetype-before-conversion',
))
# Tag policy for sanitization. The key names mirror lxml `Cleaner` keyword
# arguments -- presumably unpacked into the cleaner; confirm at the call site.
SANITIZE_TAGS = dict(
    # allow new semantic HTML5 tags
    allow_tags=clean.defs.tags.union(
        'article bdi section header footer hgroup nav aside figure main'.split(),
        [etree.Comment],
    ),
    kill_tags=[
        'base', 'embed', 'frame', 'head', 'iframe', 'link', 'meta',
        'noscript', 'object', 'script', 'style', 'title',
    ],
    remove_tags=['html', 'body'],
)
class _Cleaner(clean.Cleaner):
    """lxml ``Cleaner`` with two optional post-processing passes.

    After the base cleaner has run, the instance can additionally drop every
    ``class`` attribute (``strip_classes``) and reduce inline ``style``
    attributes to the whitelisted CSS properties (``sanitize_style``).
    """

    # Captures ``property: value`` pairs; a value may contain quoted chunks.
    _style_re = re.compile(r'''([\w-]+)\s*:\s*((?:[^;"']|"[^";]*"|'[^';]*')+)''')

    # CSS properties kept by `parse_style` (kept as a list; 'text-decoration'
    # appears twice, exactly as in the historical definition).
    _style_whitelist = [
        'font-size', 'font-family', 'font-weight', 'font-style',
        'background-color', 'color', 'text-align',
        'line-height', 'letter-spacing', 'text-transform',
        'text-decoration', 'text-decoration', 'opacity',
        'float', 'vertical-align', 'display',
        'padding', 'padding-top', 'padding-left', 'padding-bottom', 'padding-right',
        'margin', 'margin-top', 'margin-left', 'margin-bottom', 'margin-right',
        'white-space',
        # box model
        'border', 'border-color', 'border-radius', 'border-style',
        'border-width', 'border-top', 'border-bottom',
        'height', 'width', 'max-width', 'min-width', 'min-height',
        # tables
        'border-collapse', 'border-spacing', 'caption-side', 'empty-cells',
        'table-layout',
    ]
    # Per-side border properties: border-<side>-<attribute>.
    _style_whitelist += [
        'border-%s-%s' % (position, attribute)
        for position in ['top', 'bottom', 'left', 'right']
        for attribute in ('style', 'color', 'width', 'left-radius', 'right-radius')
    ]

    strip_classes = False
    sanitize_style = False

    def __call__(self, doc):
        """Run the base cleaner on ``doc``, then the optional extra passes."""
        super().__call__(doc)

        # if we keep attributes but still remove classes
        if not getattr(self, 'safe_attrs_only', False):
            if self.strip_classes:
                for element in doc.iter(tag=etree.Element):
                    self.strip_class(element)

        # if we keep style attribute, sanitize them
        if not self.style:
            if self.sanitize_style:
                for element in doc.iter(tag=etree.Element):
                    self.parse_style(element)

    def strip_class(self, el):
        """Remove the ``class`` attribute of ``el`` when it has a non-empty value."""
        if not el.attrib.get('class'):
            return
        del el.attrib['class']

    def parse_style(self, el):
        """Keep only whitelisted declarations in the ``style`` attribute of ``el``.

        The attribute is rewritten with the surviving declarations (property
        names lowercased, joined with ``'; '``) or removed when none survive.
        A missing or empty attribute is left untouched.
        """
        raw_style = el.attrib.get('style')
        if not raw_style:
            return

        kept = collections.OrderedDict()
        for name, value in self._style_re.findall(raw_style):
            name = name.lower()
            if name in self._style_whitelist:
                kept[name] = value

        if not kept:
            del el.attrib['style']
            return
        el.attrib['style'] = '; '.join('%s:%s' % pair for pair in kept.items())
def tag_quote(el):
    """Mark ``el`` (in place) when it looks like quoted e-mail content.

    The element is inspected for known mail-client markers (class / id),
    html and text signatures, text quotes (``>`` prefixed lines) and
    ``blockquote`` tags. Matching elements receive ``data-o-mail-quote="1"``;
    depending on the rule, the parent receives
    ``data-o-mail-quote-container="1"``, following siblings are marked too,
    or the matching part of the element text is wrapped in a marked ``span``.

    :param el: element to inspect; it must expose the lxml element API used
        here (``get``, ``set``, ``getparent``, ``itersiblings``, ``find``,
        ``insert``, ``tag``, ``text``)
    :return: None, the tree is modified in place
    """
    def _create_new_node(tag, text, tail=None, attrs=None):
        # Build a detached element with the given text, tail and attributes.
        new_node = etree.Element(tag)
        new_node.text = text
        new_node.tail = tail
        if attrs:
            for key, val in attrs.items():
                new_node.set(key, val)
        return new_node

    def _tag_matching_regex_in_text(regex, node, tag='span', attrs=None):
        # Wrap every match of `regex` found in `node.text` into a new `tag`
        # child; unmatched text stays in `node.text` or in the children tails.
        text = node.text or ''
        if not re.search(regex, text):
            return

        child_node = None
        idx = 0
        for node_idx, item in enumerate(re.finditer(regex, text)):
            new_node = _create_new_node(tag, text[item.start():item.end()], None, attrs)
            # Text preceding the match belongs to the node itself for the
            # first match, and to the previous wrapper's tail afterwards.
            if child_node is None:
                node.text = text[idx:item.start()]
            else:
                child_node.tail = text[idx:item.start()]
            # Provisional tail: the remaining text; it is overwritten above
            # if another match follows.
            new_node.tail = text[item.end():]
            node.insert(node_idx, new_node)
            child_node = new_node
            idx = item.end()

    el_class = el.get('class', '') or ''
    el_id = el.get('id', '') or ''

    # gmail or yahoo // # outlook, html // # msoffice
    # The SkyDrivePlaceholder marker is looked up in both class and id (the
    # former code tested the class twice and never looked at the id).
    if 'gmail_extra' in el_class or \
            'divRplyFwdMsg' in el_id or \
            ('SkyDrivePlaceholder' in el_class or 'SkyDrivePlaceholder' in el_id):
        el.set('data-o-mail-quote', '1')
        if el.getparent() is not None:
            el.getparent().set('data-o-mail-quote-container', '1')

    if (el.tag == 'hr' and ('stopSpelling' in el_class or 'stopSpelling' in el_id)) or \
            'yahoo_quoted' in el_class:
        # Quote all elements after this one
        el.set('data-o-mail-quote', '1')
        for sibling in el.itersiblings(preceding=False):
            sibling.set('data-o-mail-quote', '1')

    # html signature (-- <br />blah)
    signature_begin = re.compile(r"((?:(?:^|\n)[-]{2}[\s]?$))")
    if el.text and el.find('br') is not None and re.search(signature_begin, el.text):
        el.set('data-o-mail-quote', '1')
        if el.getparent() is not None:
            el.getparent().set('data-o-mail-quote-container', '1')

    # text-based quotes (>, >>) and signatures (-- Signature)
    text_complete_regex = re.compile(r"((?:\n[>]+[^\n\r]*)+|(?:(?:^|\n)[-]{2}[\s]?[\r\n]{1,2}[\s\S]+))")
    if not el.get('data-o-mail-quote'):
        _tag_matching_regex_in_text(text_complete_regex, el, 'span', {'data-o-mail-quote': '1'})

    if el.tag == 'blockquote':
        # remove single node
        el.set('data-o-mail-quote-node', '1')
        el.set('data-o-mail-quote', '1')

    # Propagate the quote flag from a quoted parent (or quote container),
    # unless that parent is itself a "single node" quote.
    parent = el.getparent()
    if parent is not None \
            and (parent.get('data-o-mail-quote') or parent.get('data-o-mail-quote-container')) \
            and not parent.get('data-o-mail-quote-node'):
        el.set('data-o-mail-quote', '1')
def html_normalize(src, filter_callback=None):
""" Normalize `src` for storage as an html field value.
The string is parsed as an html tag soup, made valid, then decorated for
"email quote" detection, and prepared for an optional filtering.
The filtering step (e.g. sanitization) should be performed by the
`filter_callback` function (to avoid multiple parsing operations, and
normalize the result).
:param src: the html string to normalize
:param filter_callback: optional callable taking a single `etree._Element`
document parameter, to be called during normalization in order to
filter the output document
"""
if not src:
return src
src = ustr(src, errors='replace')
# html: remove encoding attribute inside tags
doctype = re.compile(r'(<[^>]*\s)(encoding=(["\'][^"\']*?["\']|[^\s\n\r>]+)(\s[^>]*|/)?>)', re.IGNORECASE | re.DOTALL)
src = doctype.sub(u"", src)
try:
src = src.replace('--!>', '-->')
src = re.sub(r'(|)', '', src)
# On the specific case of Outlook desktop it adds unnecessary '
ParserError when sanitizing
' except Exception: if not silent: raise logger.warning(u'unknown error obtained when sanitizing %r', src, exc_info=True) sanitized = 'Unknown error when sanitizing
' return markupsafe.Markup(sanitized) # ---------------------------------------------------------- # HTML/Text management # ---------------------------------------------------------- URL_REGEX = r'(\bhref=[\'"](?!mailto:|tel:|sms:)([^\'"]+)[\'"])' TEXT_URL_REGEX = r'https?://[\w@:%.+&~#=/-]+(?:\?\S+)?' # retrieve inner content of the link HTML_TAG_URL_REGEX = URL_REGEX + r'([^<>]*>([^<>]+)<\/)?' HTML_TAGS_REGEX = re.compile('<.*?>') HTML_NEWLINES_REGEX = re.compile('<(div|p|br|tr)[^>]*>|\n') def validate_url(url): if urls.url_parse(url).scheme not in ('http', 'https', 'ftp', 'ftps'): return 'http://' + url return url def is_html_empty(html_content): """Check if a html content is empty. If there are only formatting tags with style attributes or a void content return True. Famous use case if a '``
- convert url into clickable link
- 2 or more consecutive ``
`` are considered as paragraph breaks
:param str text: plaintext to convert
:param str container_tag: container of the html; by default the content is
embedded into a ``
' br_tags = re.compile(r'(([<]\s*[bB][rR]\s*/?[>]\s*){2,})') for item in re.finditer(br_tags, text): final += text[idx:item.start()] + '
' idx = item.end() final += text[idx:] + '
' # 5. container if container_tag: # FIXME: validate that container_tag is just a simple tag? final = '<%s>%s%s>' % (container_tag, final, container_tag) return markupsafe.Markup(final) def append_content_to_html(html, content, plaintext=True, preserve=False, container_tag=None): """ Append extra content at the end of an HTML snippet, trying to locate the end of the HTML document (,