plugin.video.viervijfzes/resources/lib/viervijfzes/content.py

# -*- coding: utf-8 -*-
""" AUTH API """

from __future__ import absolute_import, division, unicode_literals

import json
import logging
import os
import re
import time
from datetime import datetime

import requests

from resources.lib.kodiutils import STREAM_DASH, STREAM_HLS
from resources.lib.viervijfzes import ResolvedStream

try:  # Python 3
    from html import unescape
except ImportError:  # Python 2
    from HTMLParser import HTMLParser

    unescape = HTMLParser().unescape

_LOGGER = logging.getLogger(__name__)

CACHE_AUTO = 1  # Allow to use the cache, and query the API if no cache is available
CACHE_ONLY = 2  # Only use the cache, don't use the API
CACHE_PREVENT = 3  # Don't use the cache


class UnavailableException(Exception):
    """ Raised when a requested item is unavailable. """


class NoContentException(Exception):
    """ Raised when no items are available. """


class GeoblockedException(Exception):
    """ Raised when an attempt is made to play a geoblocked item. """


class Program:
    """ Plain container holding the metadata of a single program. """

    def __init__(self, uuid=None, path=None, channel=None, title=None, description=None, aired=None, cover=None, background=None, seasons=None, episodes=None,
                 clips=None, my_list=False):
        """ Store the supplied values on the instance without any processing.
        :param uuid: identifier of the program
        :type uuid: str
        :param path: path of the program page
        :type path: str
        :param channel: channel (brand) the program belongs to
        :type channel: str
        :type title: str
        :type description: str
        :type aired: datetime
        :param cover: poster image
        :type cover: str
        :param background: background image
        :type background: str
        :param seasons: seasons of the program (ContentApi._parse_program_data stores a dict keyed by playlist index)
        :type seasons: list[Season]
        :type episodes: list[Episode]
        :type clips: list[Episode]
        :type my_list: bool
        """
        # NOTE: the attributes are assigned in a fixed order, since __repr__ prints them in insertion order.
        # Identification
        self.uuid, self.path, self.channel = uuid, path, channel
        # Descriptive metadata
        self.title, self.description, self.aired = title, description, aired
        # Artwork
        self.cover, self.background = cover, background
        # Related content
        self.seasons, self.episodes, self.clips = seasons, episodes, clips
        # User state
        self.my_list = my_list

    def __repr__(self):
        """ Represent the object by the repr of its attribute dictionary. """
        return '{!r}'.format(vars(self))


class Season:
    """ Plain container holding the metadata of a single season. """

    def __init__(self, uuid=None, path=None, channel=None, title=None, description=None, cover=None, number=None):
        """ Store the supplied values on the instance without any processing.
        :param uuid: identifier of the season
        :type uuid: str
        :param path: path of the season page
        :type path: str
        :param channel: channel (brand) the season belongs to
        :type channel: str
        :type title: str
        :type description: str
        :type cover: str
        :param number: season number
        :type number: int
        """
        # NOTE: the attributes are assigned in a fixed order, since __repr__ prints them in insertion order.
        # Identification
        self.uuid, self.path, self.channel = uuid, path, channel
        # Descriptive metadata
        self.title, self.description = title, description
        # Artwork and ordering
        self.cover, self.number = cover, number

    def __repr__(self):
        """ Represent the object by the repr of its attribute dictionary. """
        return '{!r}'.format(vars(self))


class Episode:
    """ Plain container holding the metadata of a single episode or clip. """

    def __init__(self, uuid=None, nodeid=None, path=None, channel=None, program_title=None, title=None, description=None, cover=None, background=None,
                 duration=None, season=None, season_uuid=None, number=None, rating=None, aired=None, expiry=None, stream=None):
        """ Store the supplied values on the instance without any processing.
        :param uuid: identifier of the video
        :type uuid: str
        :param nodeid: node identifier of the episode page
        :type nodeid: str
        :param path: path of the episode page
        :type path: str
        :type channel: str
        :param program_title: title of the program this episode belongs to
        :type program_title: str
        :type title: str
        :type description: str
        :type cover: str
        :type background: str
        :type duration: int
        :param season: season number
        :type season: int
        :param season_uuid: identifier of the season this episode belongs to
        :type season_uuid: str
        :param number: episode number
        :type number: int
        :type rating: str
        :type aired: datetime
        :type expiry: datetime
        :type stream: str
        """
        # NOTE: the attributes are assigned in a fixed order, since __repr__ prints them in insertion order.
        # Identification
        self.uuid, self.nodeid, self.path, self.channel = uuid, nodeid, path, channel
        # Descriptive metadata
        self.program_title, self.title, self.description = program_title, title, description
        # Artwork
        self.cover, self.background = cover, background
        # Playback and ordering details
        self.duration, self.season, self.season_uuid, self.number = duration, season, season_uuid, number
        # Rating and availability
        self.rating, self.aired, self.expiry = rating, aired, expiry
        # Stream reference
        self.stream = stream

    def __repr__(self):
        """ Represent the object by the repr of its attribute dictionary. """
        return '{!r}'.format(vars(self))


class Category:
    """ Plain container holding a category with its programs and episodes. """

    def __init__(self, uuid=None, channel=None, title=None, programs=None, episodes=None):
        """ Store the supplied values on the instance without any processing.
        :param uuid: identifier of the category
        :type uuid: str
        :type channel: str
        :type title: str
        :type programs: list[Program]
        :type episodes: list[Episode]
        """
        # NOTE: the attributes are assigned in a fixed order, since __repr__ prints them in insertion order.
        # Identification
        self.uuid, self.channel, self.title = uuid, channel, title
        # Content
        self.programs, self.episodes = programs, episodes

    def __repr__(self):
        """ Represent the object by the repr of its attribute dictionary. """
        return '{!r}'.format(vars(self))


class ContentApi:
    """ GoPlay Content API"""
    SITE_URL = 'https://www.goplay.be'
    API_VIERVIJFZES = 'https://api.viervijfzes.be'
    API_GOPLAY = 'https://api.goplay.be'

    def __init__(self, auth=None, cache_path=None):
        """ Initialise object.
        :param auth: object whose get_token() result is sent as the authorization header on authenticated requests
        :param cache_path: directory that holds the JSON cache files
        :type cache_path: str
        """
        self._cache_path = cache_path
        self._auth = auth
        # A single session is reused for all HTTP requests made by this object
        self._session = requests.Session()

    def get_programs(self, channel=None, cache=CACHE_AUTO):
        """ Get a list of all programs of the specified channel.
        :param channel: when set, only programs whose pageInfo brand equals this value are returned
        :type channel: str
        :param cache: one of CACHE_AUTO, CACHE_ONLY or CACHE_PREVENT
        :type cache: int
        :rtype list[Program]
        """

        def update():
            """ Scrape the program overview page and return the decoded records """
            overview = self._get_url(self.SITE_URL + '/programmas')

            # Every program is embedded as HTML-escaped JSON in a data-program attribute
            records = []
            for match in re.finditer(r'data-program="(?P<json>[^"]+)"', overview, re.DOTALL):
                records.append(json.loads(unescape(match.group('json'))))

            if not records:
                raise Exception('No programs found')

            return records

        # The listing is cached for 30 * 5 = 150 seconds
        listing = self._handle_cache(key=['programs'], cache_mode=cache, update=update, ttl=30 * 5)
        if not listing:
            return []

        # Without a channel every record is kept; the brand is only looked at when filtering
        return [
            self._parse_program_data(record)
            for record in listing
            if not channel or record['pageInfo']['brand'] == channel
        ]

    def get_program(self, path, extract_clips=False, cache=CACHE_AUTO):
        """ Get a Program object from the specified page.
        :param path: path of the program page, relative to SITE_URL
        :type path: str
        :param extract_clips: also scrape clips from the page; only possible when the page was really downloaded
        :type extract_clips: bool
        :param cache: one of CACHE_AUTO, CACHE_ONLY or CACHE_PREVENT
        :type cache: int
        :rtype Program
        """
        # Python 2.7 has no nonlocal, so the nested function hands the downloaded page back through a mutable holder
        fetched = {'page': None}

        def update():
            """ Download the program page and return the JSON embedded in its data-hero attribute """
            html = self._get_url(self.SITE_URL + '/' + path)

            # Keep the page around so clips can be extracted from it afterwards
            fetched['page'] = html

            hero = re.search(r'data-hero="([^"]+)', html, re.DOTALL)
            return json.loads(unescape(hero.group(1)))['data']

        data = self._handle_cache(key=['program', path], cache_mode=cache, update=update)
        if not data:
            return None

        program = self._parse_program_data(data)

        # The page is only available when update() ran; a cache hit leaves the holder empty
        if extract_clips and fetched['page']:
            program.clips = self._extract_videos(fetched['page'])

        return program

    def get_program_by_uuid(self, uuid, cache=CACHE_AUTO):
        """ Get a Program object with the specified uuid.
        :param uuid: identifier of the program; a falsy value yields None
        :type uuid: str
        :param cache: one of CACHE_AUTO, CACHE_ONLY or CACHE_PREVENT
        :type cache: int
        :rtype Program
        """
        if not uuid:
            return None

        def update():
            """ Download the program JSON from the site API """
            return json.loads(self._get_url(self.SITE_URL + '/api/program/%s' % uuid))

        data = self._handle_cache(key=['program', uuid], cache_mode=cache, update=update)
        return self._parse_program_data(data) if data else None

    def get_episode(self, path, cache=CACHE_AUTO):
        """ Get an Episode object from the specified page.
        :param path: path of the episode page, relative to SITE_URL
        :type path: str
        :param cache: one of CACHE_AUTO, CACHE_ONLY or CACHE_PREVENT
        :type cache: int
        :rtype Episode
        """

        def update():
            """ Download the page and return either the video JSON or the program and episode JSON """
            html = self._get_url(self.SITE_URL + '/' + path)

            # Preferred source: a data-video attribute that points to the video API.
            # This attribute is not present on every page.
            video_match = re.search(r'data-video="([^"]+)"', html, re.DOTALL)
            if video_match:
                video_id = json.loads(unescape(video_match.group(1)))['id']
                return dict(video=json.loads(self._get_url('%s/api/video/%s' % (self.SITE_URL, video_id))))

            # Fallback: the program JSON from the data-hero attribute...
            hero_match = re.search(r'data-hero="([^"]+)', html, re.DOTALL)
            program_json = json.loads(unescape(hero_match.group(1)))['data'] if hero_match else None

            # ...combined with the page settings JSON
            settings_match = re.search(r'<script type="application/json" data-drupal-selector="drupal-settings-json">(.*?)</script>', html, re.DOTALL)
            episode_json = json.loads(unescape(settings_match.group(1))) if settings_match else None

            return dict(program=program_json, episode=episode_json)

        data = self._handle_cache(key=['episode', path], cache_mode=cache, update=update)
        if not data:
            return None

        # Detailed episode information was found
        video = data['video'] if 'video' in data else None
        if video:
            return self._parse_episode_data(video)

        # No detailed information: look the episode up in the program JSON by its node id
        has_both = 'program' in data and 'episode' in data
        if has_both and data['program'] and data['episode']:
            program = self._parse_program_data(data['program'])
            return next(
                (candidate for candidate in program.episodes
                 if candidate.nodeid == data['episode']['pageInfo']['nodeId']),
                None,
            )

        return None

    def get_stream_by_uuid(self, uuid):
        """ Resolve the stream to use for this video.
        :param uuid: identifier of the video
        :type uuid: str
        :return: a DASH stream with license details when the content has a videoDash entry, a HLS stream otherwise
        :rtype ResolvedStream
        """
        content = json.loads(self._get_url(self.API_VIERVIJFZES + '/content/%s' % uuid, authentication=True))

        # Normal HLS stream
        if 'videoDash' not in content:
            return ResolvedStream(
                uuid=uuid,
                url=content['video']['S'],
                stream_type=STREAM_HLS,
            )

        # DRM protected stream
        # See https://docs.unified-streaming.com/documentation/drm/buydrm.html#setting-up-the-client
        drm_key = content['drmKey']['S']

        _LOGGER.debug('Fetching Authentication XML with drm_key %s', drm_key)
        decoded = json.loads(self._get_url(self.API_GOPLAY + '/restricted/decode/%s' % drm_key, authentication=True))

        return ResolvedStream(
            uuid=uuid,
            url=content['videoDash']['S'],
            stream_type=STREAM_DASH,
            license_url='https://wv-keyos.licensekeyserver.com/',
            auth=decoded.get('auth'),
        )

    # def get_categories(self):
    #     """ Get a list of all categories.
    #     :rtype list[Category]
    #     """
    #     # Load webpage
    #     raw_html = self._get_url(self.SITE_URL)
    #
    #     # Categories regexes
    #     regex_articles = re.compile(r'<article([^>]+)>(.*?)</article>', re.DOTALL)
    #     regex_submenu_id = re.compile(r'data-submenu-id="([^"]*)"')  # splitted since the order might change
    #     regex_submenu_title = re.compile(r'data-submenu-title="([^"]*)"')
    #
    #     categories = []
    #     for result in regex_articles.finditer(raw_html):
    #         article_info_html = result.group(1)
    #         article_html = result.group(2)
    #         category_title = regex_submenu_title.search(article_info_html).group(1)
    #         category_id = regex_submenu_id.search(article_info_html).group(1)
    #
    #         # Skip empty categories or 'All programs'
    #         if not category_id or category_id == 'programmas':
    #             continue
    #
    #         # Extract items
    #         programs = self._extract_programs(article_html, channel)
    #         episodes = self._extract_videos(article_html)
    #         categories.append(Category(uuid=category_id, channel=channel, title=category_title, programs=programs, episodes=episodes))
    #
    #     return categories

    # @staticmethod
    # def _extract_programs(html, channel):
    #     """ Extract Programs from HTML code """
    #     # Item regexes
    #     regex_item = re.compile(r'<a[^>]+?href="(?P<path>[^"]+)"[^>]+?>'
    #                             r'.*?<h3 class="poster-teaser__title"><span>(?P<title>[^<]*)</span></h3>.*?'
    #                             r'</a>', re.DOTALL)
    #
    #     # Extract items
    #     programs = []
    #     for item in regex_item.finditer(html):
    #         path = item.group('path')
    #         if path.startswith('/video'):
    #             continue
    #
    #         title = unescape(item.group('title'))
    #
    #         # Program
    #         programs.append(Program(
    #             path=path.lstrip('/'),
    #             channel=channel,
    #             title=title,
    #         ))
    #
    #     return programs

    @staticmethod
    def _extract_videos(html):
        """ Extract videos from HTML code.

        Every anchor that contains a teaser title and whose href starts with '/video' becomes an Episode.
        Optional attributes that are missing or malformed (e.g. an empty data-duration) are logged and left
        as None, so a single bad item does not abort the whole extraction.

        :type html: str
        :rtype list[Episode]
        """
        # Item regexes
        regex_item = re.compile(r'<a[^>]+?href="(?P<path>[^"]+)"[^>]+?>.*?</a>', re.DOTALL)

        # Episode regexes
        regex_episode_title = re.compile(r'<h3 class="(?:poster|card|image)-teaser__title">(?:<span>)?([^<]*)(?:</span>)?</h3>')
        regex_episode_program = re.compile(r'<div class="card-teaser__label">([^<]*)</div>')
        regex_episode_duration = re.compile(r'data-duration="([^"]*)"')
        regex_episode_video_id = re.compile(r'data-videoid="([^"]*)"')
        regex_episode_image = re.compile(r'data-background-image="([^"]*)"')
        regex_episode_timestamp = re.compile(r'data-timestamp="([^"]*)"')

        def find(regex, item_html, name, title, convert=None):
            """ Return the (optionally converted) first group of regex in item_html, or None when absent or invalid """
            match = regex.search(item_html)
            if match is None:
                _LOGGER.warning('Found no %s for %s', name, title)
                return None

            value = match.group(1)
            if convert is None:
                return value

            try:
                return convert(value)
            except ValueError:
                # The regexes also match empty or non-numeric attribute values, which int() rejects
                _LOGGER.warning('Found invalid %s for %s: %r', name, title, value)
                return None

        # Extract items
        episodes = []
        for item in regex_item.finditer(html):
            item_html = item.group(0)
            path = item.group('path')

            # Extract title; anchors without a teaser title are not items
            title_match = regex_episode_title.search(item_html)
            if title_match is None:
                continue
            title = unescape(title_match.group(1))

            # This is not a episode
            if not path.startswith('/video'):
                continue

            # The program label is HTML text, so it is unescaped just like the title
            episode_program = find(regex_episode_program, item_html, 'episode_program', title, unescape)
            episode_duration = find(regex_episode_duration, item_html, 'episode_duration', title, int)
            episode_video_id = find(regex_episode_video_id, item_html, 'episode_video_id', title)
            episode_image = find(regex_episode_image, item_html, 'episode_image', title, unescape)
            episode_timestamp = find(regex_episode_timestamp, item_html, 'episode_timestamp', title, int)

            # Episode
            episodes.append(Episode(
                path=path.lstrip('/'),
                channel='',  # TODO
                title=title,
                duration=episode_duration,
                uuid=episode_video_id,
                aired=datetime.fromtimestamp(episode_timestamp) if episode_timestamp else None,
                cover=episode_image,
                program_title=episode_program,
            ))

        return episodes

    @staticmethod
    def _parse_program_data(data):
        """ Parse the Program JSON.

        The returned Program has its seasons filled with a dict of Season objects keyed by the index of the
        playlist, and its episodes with a flat list of the episodes of all playlists.

        :param data: program record; the keys id, link, pageInfo, title, description, images and playlists are read
        :type data: dict
        :rtype Program
        """
        # The publish date is optional: datetime.fromtimestamp() does not accept None
        publish_date = data.get('pageInfo', {}).get('publishDate')

        # Create Program info
        program = Program(
            uuid=data['id'],
            path=data['link'].lstrip('/'),
            channel=data['pageInfo']['brand'],
            title=data['title'],
            description=data['description'],
            aired=datetime.fromtimestamp(publish_date) if publish_date is not None else None,
            cover=data['images']['poster'],
            background=data['images']['hero'],
        )

        # Create Season info
        program.seasons = {
            key: Season(
                uuid=playlist['id'],
                path=playlist['link'].lstrip('/'),
                channel=playlist['pageInfo']['brand'],
                title=playlist['title'],
                description=playlist['pageInfo']['description'],
                # The season number is taken from the first episode of the playlist,
                # which is why playlists without episodes are skipped
                number=playlist['episodes'][0]['seasonNumber'],
            )
            for key, playlist in enumerate(data['playlists']) if playlist['episodes']
        }

        # Create Episodes info
        program.episodes = [
            ContentApi._parse_episode_data(episode, playlist['id'])
            for playlist in data['playlists']
            for episode in playlist['episodes']
        ]

        return program

    @staticmethod
    def _parse_episode_data(data, season_uuid=None):
        """ Parse the Episode JSON.

        Every field is optional; a field that is absent from the record results in None on the Episode.

        :param data: episode record
        :type data: dict
        :param season_uuid: identifier of the season (playlist) this episode was found in
        :type season_uuid: str
        :rtype Episode
        """
        title = data.get('title')
        link = data.get('link')
        program = data.get('program')
        created_date = data.get('createdDate')
        unpublish_date = data.get('unpublishDate')
        # Also copes with an explicit null for pageInfo
        page_info = data.get('pageInfo') or {}

        episode_number = data.get('episodeNumber')
        if not episode_number:
            # The episodeNumber can be absent; fall back on a number at the end of the title
            match = re.search(r'\d+$', title) if title else None
            # Convert to int so the number has the same type as a regular episodeNumber
            episode_number = int(match.group(0)) if match else None

        episode = Episode(
            uuid=data.get('videoUuid'),
            nodeid=page_info.get('nodeId'),
            path=link.lstrip('/') if link is not None else None,
            channel=page_info.get('site'),
            program_title=program.get('title') if program else title,
            title=title,
            description=page_info.get('description'),
            cover=data.get('image'),
            background=data.get('image'),
            duration=data.get('duration'),
            season=data.get('seasonNumber'),
            season_uuid=season_uuid,
            number=episode_number,
            # datetime.fromtimestamp() does not accept None, so a missing date stays None
            aired=datetime.fromtimestamp(created_date) if created_date is not None else None,
            expiry=datetime.fromtimestamp(unpublish_date) if unpublish_date else None,
            rating=data.get('parentalRating'),
            stream=data.get('path'),
        )
        return episode

    def _get_url(self, url, params=None, authentication=False):
        """ Makes a GET request for the specified URL.
        :param url: URL to fetch
        :type url: str
        :param params: query parameters that are passed on to the session
        :param authentication: send the token of the auth object in the authorization header
        :type authentication: bool
        :return: the body of the response
        :rtype str
        :raises Exception: when authentication is requested without an auth object, or the status code is not 200
        """
        request_args = dict(params=params)
        if authentication:
            if not self._auth:
                raise Exception('Requested to authenticate, but not auth object passed')
            request_args['headers'] = {
                'authorization': self._auth.get_token(),
            }

        response = self._session.get(url, **request_args)
        if response.status_code == 200:
            return response.text

        # Log the body of the failed response to ease debugging
        _LOGGER.error(response.text)
        raise Exception('Could not fetch data')

    def _handle_cache(self, key, cache_mode, update, ttl=30 * 24 * 60 * 60):
        """ Fetch something from the cache, and update if needed.

        :param key: parts that identify the cached item
        :type key: list[str]
        :param cache_mode: CACHE_AUTO and CACHE_ONLY read the cache first; CACHE_ONLY never calls update;
                           any other value (CACHE_PREVENT) skips reading the cache
        :type cache_mode: int
        :param update: callable without arguments that returns fresh data
        :param ttl: number of seconds the stored data stays valid
        :type ttl: int
        :return: the cached or fresh data; when update fails, the expired cached data or None
        """
        data = None
        if cache_mode in [CACHE_AUTO, CACHE_ONLY]:
            # Try to fetch from cache
            data = self._get_cache(key)
            if data is None and cache_mode == CACHE_ONLY:
                return None

        if data is not None:
            return data

        try:
            # Fetch fresh data
            _LOGGER.debug('Fetching fresh data for key %s', '.'.join(key))
            data = update()
        except Exception as exc:  # pylint: disable=broad-except
            _LOGGER.warning('Something went wrong when refreshing live data: %s. Using expired cached values.', exc)
            return self._get_cache(key, allow_expired=True)

        if data:
            # Store fresh response in cache. This happens outside the try block above, so that a failure
            # to write the cache does not throw away the data that was fetched successfully.
            try:
                self._set_cache(key, data, ttl)
            except (IOError, OSError, TypeError, ValueError) as exc:
                _LOGGER.warning('Could not store %s in the cache: %s', '.'.join(key), exc)

        return data

    def _get_cache(self, key, allow_expired=False):
        """ Get an item from the cache.

        :param key: parts that identify the cached item; they are joined with dots to form the filename
        :type key: list[str]
        :param allow_expired: also return the item when its deadline has passed
        :type allow_expired: bool
        :return: the decoded JSON value, or None when there is no usable cache entry
        """
        if self._cache_path is None:
            # No cache directory was configured (the default of the constructor), so nothing can be cached
            return None

        filename = ('.'.join(key) + '.json').replace('/', '_')
        fullpath = os.path.join(self._cache_path, filename)

        try:
            # The modification time of the file holds the deadline set by _set_cache
            deadline = os.stat(fullpath).st_mtime
        except OSError:
            # The file does not exist (anymore)
            return None

        if not allow_expired and deadline < time.time():
            return None

        try:
            with open(fullpath, 'r') as fdesc:
                _LOGGER.debug('Fetching %s from cache', filename)
                return json.load(fdesc)
        except (IOError, OSError, ValueError, TypeError):
            # An unreadable or corrupt cache file is treated as a cache miss
            return None

    def _set_cache(self, key, data, ttl):
        """ Store an item in the cache.

        :param key: parts that identify the cached item; they are joined with dots to form the filename
        :type key: list[str]
        :param data: JSON serializable value to store
        :param ttl: number of seconds the item stays valid
        :type ttl: int
        """
        if self._cache_path is None:
            # No cache directory was configured (the default of the constructor), so there is nowhere to store
            return

        filename = ('.'.join(key) + '.json').replace('/', '_')
        fullpath = os.path.join(self._cache_path, filename)

        if not os.path.exists(self._cache_path):
            os.makedirs(self._cache_path)

        with open(fullpath, 'w') as fdesc:
            _LOGGER.debug('Storing to cache as %s', filename)
            json.dump(data, fdesc)

        # Set TTL by modifying modification date: the mtime of the file is the moment the item expires
        deadline = int(time.time()) + ttl
        os.utime(fullpath, (deadline, deadline))