#!/usr/bin/ruby
#
# Author: Tatsuki Sugiura <sugi@nemui.org>
# License: Ruby's
#

require 'rss/2.0'
require 'rss/maker'
require 'htree'
require 'open-uri'
require 'pstore'
require 'net/http'
require 'uri'
require 'time'
require 'nkf'
Net::HTTP.version_1_2

# Downloads an HTML page and parses it with HTree.  The parsed tree and the
# response's Last-Modified time are kept in a PStore file, so that a later
# run can send a conditional request and reuse the stored tree when the
# server answers "304 Not Modified".
class HtmlFetcher
  # uri::    address of the page; also used as the key inside the cache file.
  # option:: settings merged over the defaults.  :cache is the path of the
  #          PStore file (default "http-fetch.cache").
  def initialize(uri, option={})
    @uri = uri
    @opt = {
      :cache => "http-fetch.cache",
    }
    @opt.update option
    @fetched_p = false
    @cache = PStore.new(@opt[:cache])
    @htree = nil
  end

  # Requests the page with an If-Modified-Since header built from the cached
  # Last-Modified time.  A fresh body is passed through NKF with '-w', parsed
  # with HTree and written to the cache; on a 304 answer the tree stored by
  # an earlier run is loaded instead.  Any other OpenURI::HTTPError is
  # re-raised.  Returns the parsed tree.
  def fetch
    html_source = request_page
    if html_source
      begin
        @fetched_p = true
        @htree = HTree(NKF.nkf('-w', html_source.read))
        @cache.transaction {
          @cache[@uri] = {:htree => @htree,
            :last_modified => html_source.last_modified }
        }
      ensure
        # The response is a StringIO or Tempfile; release it once consumed.
        html_source.close
      end
    else
      @cache.transaction {
        @htree = @cache[@uri][:htree]
      }
    end
    @htree
  end

  # True once #fetch has downloaded a fresh copy of the page, false while
  # nothing was fetched or the cached tree was reused.
  def fetched?
    @fetched_p
  end

  # Original, misspelled name of #fetched?; kept so existing callers work.
  alias_method :fechted?, :fetched?

  # Last-Modified time stored for this URI, or the Unix epoch when the cache
  # has no entry (or no timestamp) for it.
  def last_fetch_time
    t = nil
    @cache.transaction { @cache[@uri] and t = @cache[@uri][:last_modified] }
    t ? t : Time.at(0)
  end

  private

  # Performs the conditional GET.  Returns the open-uri response object, or
  # nil when the server answered 304.
  def request_page
    headers = { "If-Modified-Since" => last_fetch_time.httpdate }
    # Open through the parsed URI object instead of Kernel#open: Kernel#open
    # stopped opening URLs in Ruby 3.0 and would run a command for strings
    # starting with "|".
    URI.parse(@uri.to_s).open(headers)
  rescue OpenURI::HTTPError => e
    raise unless /^304 /.match(e.message)
    nil
  end
end

# Builds an RSS 2.0 podcast feed from the audio links of an HTML page.  The
# page is obtained through HtmlFetcher; how the page is turned into feed
# items is decided by a replaceable conversion proc (see #convproc=).
class HTML2PodCast
  # uri:: address of the HTML page to convert.
  # opt:: settings merged over the defaults.  :cache is the PStore path
  #       handed to HtmlFetcher (default "http-fetch.cache").
  def initialize(uri, opt = {})
    @uri = uri
    @opt = {
      :cache => "http-fetch.cache",
    }
    @opt.update opt
    @convproc = default_convproc
  end

  # Returns the stock conversion proc.  It is called with the RSS maker and
  # the page as a REXML document, and adds one item per <a> element whose
  # href ends in .mp3, .ogg, .wma or .flac.  Enclosure type, length and the
  # item date come from a HEAD request against the linked file.
  def default_convproc
    Proc.new { |rss, html|
      html.elements.each("//a") { |link|
        next unless link.attributes["href"] =~ /\.(mp3|ogg|wma|flac)$/io
        item = rss.items.new_item
        item.link = @uri
        enc_uri = URI.parse(@uri) + URI.parse(link.attributes["href"])
        item.title = link.text
        # can NOT find description automatically...
        item.description = "Page URL: #{@uri}\nEnclosure: #{enc_uri}"
        enc_head = http_req_head(enc_uri)
        item.enclosure.url = enc_uri
        # A server may omit any of these headers; read them without
        # indexing into nil.
        item.enclosure.type = first_header(enc_head, "content-type")
        item.enclosure.length = first_header(enc_head, "content-length").to_i
        modified = first_header(enc_head, "last-modified")
        item.pubDate = Time.parse(modified) if modified
      }
    }
  end

  # Fetches the page and returns the generated feed as a String.
  def to_rss
    # Forward the configured cache path; it was accepted by #initialize but
    # never reached the fetcher before.
    fetcher = HtmlFetcher.new(@uri, :cache => @opt[:cache])
    h = fetcher.fetch
    x = h.to_rexml
    convert(x).to_s
  end

  # Builds the feed for +html+ (a REXML document): fills in the channel
  # fields, then lets the conversion proc add the items.  Returns whatever
  # RSS::Maker.make returns.
  def convert(html)
    RSS::Maker.make("2.0") {|rss|
      rss.items.do_sort = true
      rss.channel.link = @uri
      # Fall back to the page address when the document has no usable title.
      title = html.elements["/html/head/title"]
      rss.channel.title = (title && title.text) || @uri
      rss.channel.description = "Autogenarated podcast XML for #{@uri}"
      @convproc.call(rss, html)
    }
  end

  # Replaces the conversion proc.  Accepts a callable taking (rss, html), or
  # a block when invoked through send.  The block is captured explicitly
  # because a bare Proc.new without a block raises on Ruby 3 and later.
  def convproc=(proc = nil, &block)
    @convproc = proc || block
  end

  # Sends a HEAD request for +uri+, going through the proxy named by the
  # http_proxy environment variable when it is set.  Returns a Hash mapping
  # each lower-case header name to an Array of its values.  +opt+ is
  # accepted for compatibility and currently unused.
  def http_req_head(uri, opt = {})
    proxy_host = proxy_port = nil
    if ENV.member? "http_proxy"
      proxy = URI.parse(ENV["http_proxy"])
      proxy_host = proxy.host
      proxy_port = proxy.port
    end
    uri = URI.parse(uri.to_s)
    http = Net::HTTP.new(uri.host, uri.port || 80, proxy_host, proxy_port)
    # Talk TLS to https targets where this Net::HTTP supports it.
    http.use_ssl = true if uri.scheme == "https" && http.respond_to?(:use_ssl=)
    # request_uri keeps the query string and is "/" for an empty path.
    http.head(uri.request_uri).to_hash
  end

  private

  # First value of header +name+ in a hash as returned by #http_req_head,
  # or nil when the header is absent.
  def first_header(head, name)
    values = head[name]
    values && values[0]
  end

end

if $0 == __FILE__
  # Page converted when no address is given on the command line.
  sample_site = "http://abab.dip.jp/piano/"
  uri = ARGV[0] || sample_site

  h2p = HTML2PodCast.new(uri)

  if uri == sample_site
    # Example of a site-specific conversion proc, written for abab.dip.jp.
    # Many thanks for the great tunes!
    sample_conv = Proc.new do |rss, html|
      # The second <p> of the body is used as the channel description.
      intro = html.elements.to_a("/html/body/p")[1]
      rss.channel.description = intro.texts.join(" ")

      # One feed item per table row that links to an mp3 file.
      html.elements.each("/html/body/p/table/tr") do |row|
        anchor = row.elements["td/a"]
        next unless anchor
        href = anchor.attributes["href"]
        next unless href =~ /\.mp3$/io

        item = rss.items.new_item
        item.link = uri
        target = URI.parse(uri) + URI.parse(href)
        head = h2p.http_req_head(target)
        item.title = anchor.text
        # The second cell of the row carries the description text.
        item.description = row.elements.to_a("td")[1].text

        enclosure = item.enclosure
        enclosure.url = target
        enclosure.type = "audio/mpeg"
        enclosure.length = head["content-length"][0].to_i
        item.pubDate = Time.parse(head["last-modified"][0])
      end
    end
    h2p.convproc = sample_conv
  end

  puts h2p.to_rss
end
