mail/tools/link_preview.py

# -*- coding: utf-8 -*-
# Part of Odoo. See LICENSE file for full copyright and licensing details.

from lxml import etree
from lxml import html
import requests


def get_link_preview_from_url(url, request_session=None):
    """
    Get the Open Graph properties of an url. (https://ogp.me/)
    If the url leads directly to an image mimetype, return
    the url as preview image else retrieve the properties from
    the html page.

    Using a stream request to prevent loading the whole page
    as those properties are declared in the <head> tag.

    The request session is optional as in some cases using
    a session could be beneficial performance wise
    (e.g. a lot of url could have the same domain).

    :param url: address to fetch.
    :param request_session: optional object exposing a requests-like
        ``get`` method; the ``requests`` module is used when it is falsy.
    :return: a dict describing the preview, or False when the request
        fails, the status is not ok, the Content-Type header is missing
        or the content is neither an image nor an html page.
    """
    # Some websites are blocking non browser user agent.
    user_agent = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'}
    requester = request_session or requests
    try:
        response = requester.get(url, timeout=3, headers=user_agent, allow_redirects=True, stream=True)
    except requests.exceptions.RequestException:
        return False
    # With stream=True the body is not consumed, so the response has to be
    # closed explicitly on every exit path to release the connection.
    try:
        if not response.ok:
            return False
        raw_content_type = response.headers.get('Content-Type')
        if not raw_content_type:
            return False
        # Content-Type header can return a charset, but we just need the
        # mimetype (eg: image/jpeg;charset=ISO-8859-1). Media types are
        # case-insensitive, so normalize before comparing.
        mimetype = raw_content_type.split(';')[0].strip().lower()
        if mimetype.startswith('image/'):
            return {
                'image_mimetype': mimetype,
                'og_image': url,  # If the url mimetype is already an image type, set url as preview image
                'source_url': url,
            }
        if mimetype.startswith('text/html'):
            return get_link_preview_from_html(url, response)
        return False
    finally:
        response.close()

def get_link_preview_from_html(url, response):
    """
    Retrieve the Open Graph properties from the html page. (https://ogp.me/)
    Load the page with chunks of 8kb to prevent loading the whole
    html when we only need the <head> tag content.
    Fallback on the <title> tag if the html doesn't have
    any Open Graph title property.

    :param url: address the response was fetched from, returned as
        ``source_url``.
    :param response: streamed response exposing ``iter_content``.
    :return: a dict of Open Graph properties (missing ones are None), or
        False when the body cannot be read, is blank, cannot be parsed or
        has neither an ``og:title`` property nor a <title> tag.
    """
    end_of_head = b'</head>'
    content = bytearray()
    try:
        for chunk in response.iter_content(chunk_size=8192):
            # Resume the search just before the new chunk so a closing tag
            # split across two chunks is still found, whatever the size of
            # the chunk actually received.
            search_from = max(len(content) - len(end_of_head) + 1, 0)
            content += chunk
            pos = content.find(end_of_head, search_from)
            # Stop reading once all the <head> data is found
            if pos != -1:
                del content[pos + len(end_of_head):]
                break
    except requests.exceptions.RequestException:
        # A connection dropped while streaming means no preview, like a
        # failure of the initial request.
        return False

    # A blank document cannot be parsed, there is nothing to preview.
    if not content.strip():
        return False
    try:
        tree = html.fromstring(bytes(content))
    except etree.ParserError:
        return False

    def first_meta(prop):
        """Return the content of the first <meta> with this property, or None."""
        values = tree.xpath('//meta[@property="%s"]/@content' % prop)
        return values[0] if values else None

    og_title = first_meta('og:title')
    if og_title is None:
        # Fallback on the <title> tag if it exists
        title_node = tree.find('.//title')
        if title_node is None:
            return False
        og_title = title_node.text
    return {
        'og_description': first_meta('og:description'),
        'og_image': first_meta('og:image'),
        'og_mimetype': first_meta('og:image:type'),
        'og_title': og_title,
        'og_type': first_meta('og:type'),
        'og_site_name': first_meta('og:site_name'),
        'source_url': url,
    }
initial commit 2024-05-03 12:40:35 +03:00			`# -- coding: utf-8 --`
			`# Part of Odoo. See LICENSE file for full copyright and licensing details.`

			`from lxml import html`
			`import requests`


			`def get_link_preview_from_url(url, request_session=None):`
			`"""`
			`Get the Open Graph properties of an url. (https://ogp.me/)`
			`If the url leads directly to an image mimetype, return`
			`the url as preview image else retrieve the properties from`
			`the html page.`

			`Using a stream request to prevent loading the whole page`
			`as those properties are declared in the <head> tag.`

			`The request session is optional as in some cases using`
			`a session could be beneficial performance wise`
			`(e.g. a lot of url could have the same domain).`
			`"""`
			`# Some websites are blocking non browser user agent.`
			`user_agent = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0'}`
			`try:`
			`if request_session:`
			`response = request_session.get(url, timeout=3, headers=user_agent, allow_redirects=True, stream=True)`
			`else:`
			`response = requests.get(url, timeout=3, headers=user_agent, allow_redirects=True, stream=True)`
			`except requests.exceptions.RequestException:`
			`return False`
			`if not response.ok or not response.headers.get('Content-Type'):`
			`return False`
			`# Content-Type header can return a charset, but we just need the`
			`# mimetype (eg: image/jpeg;charset=ISO-8859-1)`
			`content_type = response.headers['Content-Type'].split(';')`
			`if response.headers['Content-Type'].startswith('image/'):`
			`return {`
			`'image_mimetype': content_type[0],`
			`'og_image': url, # If the url mimetype is already an image type, set url as preview image`
			`'source_url': url,`
			`}`
			`elif response.headers['Content-Type'].startswith('text/html'):`
			`return get_link_preview_from_html(url, response)`
			`return False`

			`def get_link_preview_from_html(url, response):`
			`"""`
			`Retrieve the Open Graph properties from the html page. (https://ogp.me/)`
			`Load the page with chunks of 8kb to prevent loading the whole`
			`html when we only need the <head> tag content.`
			`Fallback on the <title> tag if the html doesn't have`
			`any Open Graph title property.`
			`"""`
			`content = b""`
			`for chunk in response.iter_content(chunk_size=8192):`
			`content += chunk`
			`pos = content.find(b'</head>', -8196 * 2)`
			`# Stop reading once all the <head> data is found`
			`if pos != -1:`
			`content = content[:pos + 7]`
			`break`

			`if not content:`
			`return False`
			`tree = html.fromstring(content)`
			`og_title = tree.xpath('//meta[@property="og:title"]/@content')`
			`if og_title:`
			`og_title = og_title[0]`
			`elif tree.find('.//title') is not None:`
			`# Fallback on the <title> tag if it exists`
			`og_title = tree.find('.//title').text`
			`else:`
			`return False`
			`og_description = tree.xpath('//meta[@property="og:description"]/@content')`
			`og_type = tree.xpath('//meta[@property="og:type"]/@content')`
			`og_site_name = tree.xpath('//meta[@property="og:site_name"]/@content')`
			`og_image = tree.xpath('//meta[@property="og:image"]/@content')`
			`og_mimetype = tree.xpath('//meta[@property="og:image:type"]/@content')`
			`return {`
			`'og_description': og_description[0] if og_description else None,`
			`'og_image': og_image[0] if og_image else None,`
			`'og_mimetype': og_mimetype[0] if og_mimetype else None,`
			`'og_title': og_title,`
			`'og_type': og_type[0] if og_type else None,`
			`'og_site_name': og_site_name[0] if og_site_name else None,`
			`'source_url': url,`
			`}`