mail/tools/link_preview.py

88 lines
3.5 KiB
Python
Raw Normal View History

2024-05-03 12:40:35 +03:00
# -*- coding: utf-8 -*-
# Part of Odoo. See LICENSE file for full copyright and licensing details.
from lxml import html
import requests
def get_link_preview_from_url(url, request_session=None):
    """
    Get the Open Graph properties of an url. (https://ogp.me/)
    If the url leads directly to an image mimetype, return
    the url as preview image else retrieve the properties from
    the html page.
    Using a stream request to prevent loading the whole page
    as those properties are declared in the <head> tag.
    The request session is optional as in some cases using
    a session could be beneficial performance wise
    (e.g. a lot of url could have the same domain).

    :param url: address of the page or image to preview.
    :param request_session: optional object exposing a ``get`` method with
        the same signature as ``requests.get``; when falsy, ``requests.get``
        is used directly.
    :return: a dict with the keys ``image_mimetype``, ``og_image`` and
        ``source_url`` when the url is an image, the result of
        ``get_link_preview_from_html`` when it is an html page, and
        ``False`` when the request fails, the response is not ok, has no
        ``Content-Type`` header or has an unsupported mimetype.
    """
    # Some websites are blocking non browser user agent.
    user_agent = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'}
    http_get = request_session.get if request_session else requests.get
    try:
        response = http_get(url, timeout=3, headers=user_agent, allow_redirects=True, stream=True)
    except requests.exceptions.RequestException:
        return False
    try:
        content_type = response.headers.get('Content-Type')
        if not response.ok or not content_type:
            return False
        # Content-Type header can return a charset, but we just need the
        # mimetype (eg: image/jpeg;charset=ISO-8859-1)
        mimetype = content_type.split(';')[0].strip()
        # Media types are case-insensitive, so compare on a lowercased copy
        # while returning the value as sent by the server.
        normalized_mimetype = mimetype.lower()
        if normalized_mimetype.startswith('image/'):
            return {
                'image_mimetype': mimetype,
                'og_image': url,  # If the url mimetype is already an image type, set url as preview image
                'source_url': url,
            }
        if normalized_mimetype.startswith('text/html'):
            return get_link_preview_from_html(url, response)
        return False
    finally:
        # With stream=True the body is usually not fully consumed, so the
        # connection is only released once the response is explicitly closed.
        response.close()
def get_link_preview_from_html(url, response):
    """
    Retrieve the Open Graph properties from the html page. (https://ogp.me/)
    Load the page with chunks of 8kb to prevent loading the whole
    html when we only need the <head> tag content.
    Fallback on the <title> tag if the html doesn't have
    any Open Graph title property.

    :param url: address the response was fetched from, returned as
        ``source_url``.
    :param response: streamed response; only its ``iter_content`` method
        is used here.
    :return: a dict with the keys ``og_description``, ``og_image``,
        ``og_mimetype``, ``og_title``, ``og_type``, ``og_site_name`` and
        ``source_url`` (missing properties are ``None``), or ``False`` when
        the page is empty, cannot be parsed or has neither an Open Graph
        title nor a <title> tag.
    """
    head_end = b'</head>'
    content = bytearray()
    # NOTE(review): nothing bounds the download when the page never closes
    # its <head>; consider capping the number of bytes read.
    for chunk in response.iter_content(chunk_size=8192):
        # Resume the search just before the new chunk: everything earlier
        # was already scanned, and the small overlap catches a closing tag
        # split across two chunks. Unlike a fixed-size tail window, this
        # stays correct when a decoded chunk is larger than chunk_size.
        search_from = max(len(content) - len(head_end) + 1, 0)
        content += chunk
        # HTML tag names are case-insensitive (e.g. </HEAD>).
        pos = content[search_from:].lower().find(head_end)
        # Stop reading once all the <head> data is found
        if pos != -1:
            del content[search_from + pos + len(head_end):]
            break
    # lxml raises on empty documents, so whitespace-only pages are rejected
    # up front.
    if not content.strip():
        return False
    # Local import: the file only imports ``html`` from lxml.
    from lxml import etree
    try:
        tree = html.fromstring(bytes(content))
    except (etree.ParserError, etree.XMLSyntaxError):
        # Unparseable markup: there is no preview to build.
        return False

    def first_meta_content(og_property):
        # First ``content`` of the matching <meta property="..."> tag, if any.
        values = tree.xpath('//meta[@property=$name]/@content', name=og_property)
        return values[0] if values else None

    og_title = tree.xpath('//meta[@property="og:title"]/@content')
    if og_title:
        og_title = og_title[0]
    else:
        # Fallback on the <title> tag if it exists
        title_node = tree.find('.//title')
        if title_node is None:
            return False
        og_title = title_node.text
    return {
        'og_description': first_meta_content('og:description'),
        'og_image': first_meta_content('og:image'),
        'og_mimetype': first_meta_content('og:image:type'),
        'og_title': og_title,
        'og_type': first_meta_content('og:type'),
        'og_site_name': first_meta_content('og:site_name'),
        'source_url': url,
    }