/usr/bin/samizdat-import-feeds is part of the samizdat 0.7.0-1 package.
This file is owned by root:root, with mode 0o755.
The actual contents of the file are shown below.
#!/usr/bin/env ruby
#
# Samizdat syndication feeds updater
#
#   Copyright (c) 2002-2011  Dmitry Borodaenko <angdraug@debian.org>
#
#   This program is free software.
#   You can distribute/modify this program under the terms of
#   the GNU General Public License version 3 or later.
#
# vim: et sw=2 sts=2 ts=8 tw=0
require 'samizdat'
require 'samizdat/helpers/syndication_helper'
require 'rss/1.0'
require 'rss/2.0'
require 'rss/dublincore'
require 'rss/maker'
require 'timeout'
require 'open-uri'   # Kernel#open is called with a URL in update_feed()
class FeedUpdaterError < RuntimeError; end
class FeedUpdater
  include SyndicationHelper
  CONNECTION_TIMEOUT = 60   # 1 minute
  def initialize
    @sites = SamizdatSites.instance.all
  end
  # cycle through all sites and update all configured imported feeds
  #
  # configuration:
  #
  #   import_feeds:
  #     name:
  #       url: http://example.com/feed.rss
  #       limit: 5
  #
  # or, to use limit:page limit:
  #
  #   import_feeds:
  #     name: http://example.com/feed.rss
  #
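  # a hypothetical combined example (feed names and URLs are placeholders):
  #
  #   import_feeds:
  #     planet:
  #       url: http://planet.example.org/rss.xml
  #       limit: 10
  #     news: http://news.example.org/feed.rss
  #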
  # each feed is only fetched once per URL across all sites
  #
  def run
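    # url => { cache URI => shared cache }, one entry per distinct feed URL across all sites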
    feeds = {}
    @sites.each do |site_name|
      @site = Site.new(site_name)
      # Only sites with a shared remote cache can import feeds.
      # See Site#initialize().
      cache_uri = config.cache or next
      each_import_feed do |feed_name, url, limit|
        # fetch once per url across shared caches of all sites
        (feeds[url] ||= {})[cache_uri] ||= shared_cache if url
      end
    end
    feeds.each do |url, caches|
      begin
        update_feed(url, caches)
      rescue FeedUpdaterError => error
        log error
        next   # just ignore the feed if it can't be updated
      end
    end
    flush_pages_with_syndication(feeds)
  end
  private
  def update_feed(url, caches)
    response = nil   # declared before the block so the assignment inside Timeout stays in scope
    begin
      Timeout.timeout(CONNECTION_TIMEOUT) do
        response = open(url) {|file| file.read }
      end
    rescue => error
      raise FeedUpdaterError, "Failed to fetch feed from #{url}: " + error.message
    end
    begin
      feed = parse_feed(response)
    rescue => error
      raise FeedUpdaterError, "Failed to parse feed from #{url}: " + error.message
    end
    caches.each_value do |c|
      c['samizdat/*/import_feeds/' + url] = feed   # '*' to avoid clashes with site_name
    end
  end
  def parse_feed(response)
    # Strip the rdf:Description section that declares the "mn" namespace
    # (http://usefulinc.com/rss/manifest/): it is not needed here and is
    # known to break parsing because of the invalid namespace URI.
    #
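    # a hypothetical example of the kind of block that gets stripped:
    #
    #   <rdf:Description rdf:about="http://example.com/">
    #     <mn:channels> ... </mn:channels>
    #   </rdf:Description>
    #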
    if response =~ %r{http://usefulinc.com/rss/manifest/}
      response.sub!(%r{<rdf:Description(.*\n)*?.*mn:channels.*(.*\n)*?.*</rdf:Description>}, '')
    end
    begin
      rss = RSS::Parser.parse(response)   # try RSS 1.0 compliant parser first
    rescue RSS::Error
      rss = RSS::Parser.parse(response, false)   # retry without validation for feeds that aren't strictly compliant
    end
    rss.respond_to?(:items) or raise FeedUpdaterError, "Failed to parse RSS"
    # don't store more than limit:page items
    rss.items[0, limit_page].collect {|item|
      {
        'link' => item.link.strip,
        'title' => item.title.strip,
        'date' => item.date
      }
    }
  end
  def flush_pages_with_syndication(feeds)
    # make sure we only flush each affected shared cache once
    all_caches = {}
    feeds.each_value do |caches|
      all_caches.merge!(caches)
    end
    all_caches.each_value do |c|
      c.flush(%r{\Asamizdat/[^/]+/index/})
    end
  end
end
FeedUpdater.new.run
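
The feed entries stored in the shared cache above are arrays of hashes with
'link', 'title' and 'date' keys, written under the key
'samizdat/*/import_feeds/' + url. As a minimal sketch (not part of this file),
assuming the same Hash-like shared cache interface that update_feed() writes
through, and a cache object and feed url already in hand, a consumer could
read the imported entries back like this:

  # hypothetical reader code; 'cache' and 'url' are placeholders
  entries = cache['samizdat/*/import_feeds/' + url] || []
  entries.each do |item|
    puts "#{item['date']}  #{item['title']}  #{item['link']}"
  end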