###########################################################################
# clive, video extraction utility
# Copyright (C) 2007 Toni Gundogdu
#
# clive is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
###########################################################################

import formatter

from htmllib import HTMLParser
from cStringIO import StringIO

from clive.util import *


# Names exported by a star-import of this module
__all__ = ['EmbedParser', 'PageParser']


class EmbedParser(HTMLParser):
  """
  Parses HTML for <embed> tags.

  Feed the HTML to the parser; the video page URLs found in the
  'src' attributes of <embed> tags are collected, in order of
  appearance and without duplicates, in the 'vurls' list.
  """
  def __init__(self):
    self.vurls = []
    fmt = formatter.AbstractFormatter(formatter.NullWriter())
    HTMLParser.__init__(self, fmt)

  def unknown_starttag(self, tag, attr):
    """
    Called for start tags that have no dedicated handler.

    tag  -- tag name
    attr -- sequence of (name, value) attribute pairs

    Every 'src' attribute of a tag whose name contains 'embed'
    is passed on to _appendurl.
    """
    if 'embed' not in tag.lower():
      return

    for a in attr:
      if len(a) > 1 and a[0] == 'src':
        self._appendurl(a[1])

  # Non-public

  def _appendurl(self, url):
    """
    Append URL to list if it qualifies.

    A URL that starts with '/' is only a path; the host is guessed
    from the page title. The URL is then normalized, and appended
    unless it lacks the YouTube/VGoogle 'signature' or is already
    in the list.

    Raises CliveError if the host of a path-only URL cannot be
    identified from the title, or if the title identifies
    Dailymotion (<embed> unsupported).
    """
    if url.startswith('/'):
      # Must be a path, insert the host
      # FIXME: ytube/vgoogle support for domain name variants
      # (e.g. video.google.ca)

      # The base parser keeps title as None until a <title> element
      # has been parsed; treat that as an empty title so that we end
      # up raising CliveError below rather than AttributeError.
      title = self.title or ''

      if 'YouTube -' in title:
        url = 'http://www.youtube.com' + url
      elif '- Google Video' in title:
        url = 'http://video.google.com' + url
      elif '- Dailymotion' in title:
        # NOTE: see TODO for some details
        raise CliveError('error : <embed> unsupported (dailymotion)')
      else:
        raise CliveError('error : unknown host (tried to id from title)')

    # Convert embed to video page URL (if app.)
    url = normalize_url(url)

    # Ignore those that do not have YouTube/VGoogle 'signature'
    if '/watch?v=' not in url and '/videoplay?' not in url:
      return

    # Look for duplicates
    if url in self.vurls:
      return

    self.vurls.append(url)


class PageParser:
  """Parses HTML for video output filename and extraction URL."""
  def parse(self, page, url, opts):
    """
    Returns a (video extraction URL, output filename) tuple.

    page -- object whose 'data' attribute holds the page HTML
    url  -- video page URL; selects the website specific extractor
    opts -- passed through unchanged to parse_fname

    The parsed page title is also stored in self.title.

    Raises CliveError if the URL does not match a supported host,
    or if the website specific extractor cannot find the video.
    """
    # Title -> filename
    fmt = formatter.AbstractFormatter(formatter.NullWriter())
    hp = HTMLParser(fmt)
    # Feed the page data as is; wrapping it in a StringIO only to
    # read it straight back out is a needless round-trip.
    hp.feed(page.data)
    hp.close()
    filename = parse_fname(hp.title, url, opts)
    self.title = hp.title

    # Website specific video URL extraction
    if 'youtube.com' in url:
      vurl = self._parseyt(page)
    elif 'video.google.' in url:
      vurl = self._parsegv(page)
    elif 'dailymotion.com' in url:
      vurl = self._parsedm(page)
    else:
      (host, _path) = parse_url(url)
      raise CliveError('error : invalid host (%s)' % host)

    return (vurl, filename)

  def _parseyt(self, page):
    """
    Constructs video extraction URL.

    YouTube has a unique ID for each video on the website.
    These IDs can be extracted from the video page HTML,
    and be used with the 'get_video' (see below) to extract
    the actual flash video from the website.

    Raises CliveError if the player string is missing from the
    page (with a specific reason when one can be recognized) or
    does not have the expected fields.
    """
    text = parse_fromto(page.data, 'player2.swf', '"')

    if not text:
      # Check for specific errors
      if 'please verify you are 18 or older' in page.data:
        raise CliveError('error : terminated (reason: age-verification)')
      if 'may contain content that is inappropriate' in page.data:
        raise CliveError('error : terminated (reason: inappropriate content)')
      if 'this video is no longer available' in page.data.lower():
        raise CliveError('error : terminated (reason: video removed)')
      raise CliveError('error : terminated (reason: video id not found)')

    # The fifth and ninth fields, splitting on single quotes, are
    # used as the 'video_id' and 't' parameters.
    s = text.split("'")
    if len(s) < 9:
      # Unexpected markup; report it the same way as a missing id
      # instead of leaking an IndexError to the caller.
      raise CliveError('error : terminated (reason: video id not found)')

    return 'http://www.youtube.com/get_video?video_id=' + \
              s[4] + '&t=' + s[8]

  def _parsegv(self, page):
    """
    Constructs video extraction URL.

    Video extraction link is now part of the googleplayer.swf
    path.

    Raises CliveError if the link cannot be found in the page.
    """
    text = parse_fromto(page.data, 'googleplayer.swf', '"')

    # Same "not found" handling as for the other websites; without
    # it a missing player string ends up as a bare IndexError.
    if not text or '=' not in text:
      raise CliveError('error : terminated (reason: video url not found)')

    return normalize_url(text.split('=', 1)[1]) # use 'videoUrl='

  def _parsedm(self, page):
    """
    Constructs video extraction URL.

    Dailymotion extraction URLs can be found within the page HTML.

    Raises CliveError if the URL cannot be found in the page
    (with a specific reason when one can be recognized).
    """
    text = parse_fromto(page.data, 'url=', '"')

    if not text:
      if 'Content deleted.' in page.data:
        raise CliveError('error : terminated (reason: video removed)')
      if 'Explicit content.' in page.data:
        raise CliveError('error : terminated (reason: inappropriate content)')
      raise CliveError('error : terminated (reason: video url not found)')

    # The wanted URL is whatever follows the second 'url=' marker
    parts = text.split('url=', 2)
    if len(parts) < 3:
      # Second marker is missing; report as not found instead of
      # leaking an IndexError to the caller.
      raise CliveError('error : terminated (reason: video url not found)')

    return normalize_url(parts[2])
