import sys import getopt import os.path from twisted.internet.selectreactor import SelectReactor from twisted.web import client basedir = os.path.dirname(__file__) datadir = os.path.join(basedir, 'data') dbdir = os.path.join(basedir, 'db') libdir = os.path.join(basedir, 'lib') sys.path.append(libdir) import BeautifulSoup from xml.etree import ElementTree import dblib import distlib metafile = os.path.join(basedir, 'meta.txt') def get_info(args): result = [] index = dblib.DB(metafile).uindex('name') if args: for arg in args: if arg in index: info = index[arg] result.append(info) else: for key in index: info = index[key] result.append(info) return result class Updater: def __init__(self, verbose): self.verbose = verbose self.load() def load(self): dists = distlib.get_all_dists() for dist in dists: distobj = distlib.get_distobj(dist) if isinstance(distobj, distlib.Cachable): basename = dist.lower() + '.txt' filename = os.path.join(datadir, basename) if os.path.exists(filename): file = open(filename) print 'Loading', basename distobj.load_cache(file) def init(self, info): self.basename = info.file self.filename = os.path.join(dbdir, info.file) self.db = dblib.DB(self.filename) self.dists = distlib.get_dists(info.dists) self.new = False self.queued = 0 self.done = 0 self.all = len(self.db) self.reactor = SelectReactor() def update(self): print 'Updating', self.basename for row in self.db: distobj = distlib.get_distobj(row.dist) if not distobj.has_version(row.section): self.done += 1 continue if hasattr(distobj, 'cached_version'): updater = RowUpdater(self, row) version = distobj.cached_version(row.section, row.name) updater.handle_version(row, version) self.done += 1 continue url = self.get_url(row) content = self.get_content(row, url) updater = RowUpdater(self, row) content.addCallbacks(updater.update, updater.error) content.addCallback(self.postprocess) self.queued += 1 if self.queued == 100: self.reactor.run() if self.queued: self.reactor.run() if self.done == self.all: if self.new: self.db.write() def postprocess(self, ignore): self.queued -= 1 self.done += 1 sys.stdout.flush() if not self.queued: self.reactor.stop() def get_url(self, row): distobj = distlib.get_distobj(row.dist) if hasattr(distobj, 'version_url'): url = distobj.version_url(row.section, row.name) else: url = distobj.url(row.section, row.name) return url def get_content(self, row, url): content = client.getPage(url) return content class RowUpdater: def __init__(self, parent, row): self.parent = parent self.row = row def get_version(self, row, content): distobj = distlib.get_distobj(row.dist) if hasattr(distobj, 'extract_soup'): soup = BeautifulSoup.BeautifulSoup(content) version = distobj.extract_soup(row.name, soup) elif hasattr(distobj, 'extract_tree'): tree = ElementTree.fromstring(content) version = distobj.extract_tree(row.name, tree) return version def handle_version(self, row, version): parent = self.parent if not version: if parent.verbose: print row.project, row.dist, 'version empty' elif row.version != version: print row.project, row.dist, version row.version = version parent.new = True else: if parent.verbose: print row.project, row.dist, 'checked' def update(self, content): row = self.row try: version = self.get_version(row, content) except: print row.project, row.dist, 'version failed' else: self.handle_version(row, version) def error(self, failure): row = self.row print row.project, row.dist, 'failed --', failure.value if __name__ == '__main__': opts, args = getopt.getopt(sys.argv[1:], 'v') verbose = False for opt, value in opts: if opt == '-v': verbose = True updater = Updater(verbose) infoset = get_info(args) for info in infoset: updater.init(info) updater.update()