From 5ca2cea388862a507484e0375edf36972cb389ef Mon Sep 17 00:00:00 2001
From: Dan McGee
Date: Sun, 12 Sep 2010 12:24:23 -0500
Subject: reporead: revamp database parsing code

This needed a little sprucing up as it has grown quite organically over
the life of this script. Make things a bit more pythonic through the use
of iterators rather than collection indexing, and try to generalize the
special cases of things a bit.

Also catch encoding problems early and fail gracefully rather than blow up
the entire package parser. A failed decode of a file should cause us to
just skip it rather than stop the entire parser. Worst case, this leaves
that package out of the web interface.

Signed-off-by: Dan McGee
---
 devel/management/commands/reporead.py | 157 ++++++++++++++--------------------
 1 file changed, 64 insertions(+), 93 deletions(-)

diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py
index fb3c4a75..7c468001 100644
--- a/devel/management/commands/reporead.py
+++ b/devel/management/commands/reporead.py
@@ -13,19 +13,12 @@ Example:
   ./manage.py reporead i686 /tmp/core.db.tar.gz
 """
 
-# multi value blocks
-REPOVARS = ['arch', 'backup', 'base', 'builddate', 'conflicts', 'csize',
-            'deltas', 'depends', 'desc', 'filename', 'files', 'force',
-            'groups', 'installdate', 'isize', 'license', 'md5sum',
-            'name', 'optdepends', 'packager', 'provides', 'reason',
-            'replaces', 'size', 'url', 'version']
-
-
 from django.core.management.base import BaseCommand, CommandError
 from django.contrib.auth.models import User
 from django.db import transaction
 from django.db.models import Q
 
+import codecs
 import os
 import re
 import sys
@@ -34,7 +27,6 @@ import logging
 from datetime import datetime
 from optparse import make_option
 
-from cStringIO import StringIO
 from logging import ERROR, WARNING, INFO, DEBUG
 
 from main.models import Arch, Package, Repo
@@ -85,49 +77,36 @@ class Command(BaseCommand):
 
 class Pkg(object):
     """An interim 'container' object for holding Arch package data."""
+    bare = ( 'name', 'base', 'arch', 'desc', 'filename',
+            'md5sum', 'url', 'builddate', 'packager' )
+    squash = ( 'license', )
+    number = ( 'csize', 'isize' )
 
-    def __init__(self, val, repo):
-        selfdict = {}
-        squash = ['arch', 'builddate', 'csize', 'desc', 'filename',
-                  'installdate', 'isize', 'license', 'md5sum',
-                  'packager', 'size', 'url']
-
-        selfdict['name'] = val['name'][0]
-        selfdict['base'] = None
-        del val['name']
-        if 'license' not in val:
-            val['license'] = []
-        for x in val.keys():
-            if x in squash:
-                if val[x] == None or len(val[x]) == 0:
-                    logger.warning("Package %s has no %s" % (selfdict['name'], x))
-                    selfdict[x] = None
-                else:
-                    selfdict[x] = ', '.join(val[x])
-                    # make sure we don't have elements larger than the db char
-                    # fields
-                    if len(selfdict[x]) > 255:
-                        selfdict[x] = selfdict[x][:254]
-            elif x == 'base':
-                selfdict[x] = val[x][0]
-            elif x == 'force':
-                selfdict[x] = True
-            elif x == 'version':
-                version = val[x][0].rsplit('-')
-                selfdict['ver'] = version[0]
-                selfdict['rel'] = version[1]
-            elif x == 'reason':
-                selfdict[x] = int(val[x][0])
-            else:
-                selfdict[x] = val[x]
-        self.__dict__ = selfdict
+    def __init__(self, repo):
         self.repo = repo
-
-    def __getattr__(self, name):
-        if name == 'force':
-            return False
-        else:
-            return None
+        self.ver = None
+        self.rel = None
+        for k in self.bare + self.squash + self.number:
+            setattr(self, k, None)
+
+    def populate(self, values):
+        for k, v in values.iteritems():
+            # ensure we stay under our DB character limit
+            if k in self.bare:
+                setattr(self, k, v[0][:254])
+            elif k in self.squash:
+                setattr(self, k, u', '.join(v)[:254])
+            elif k in self.number:
+                setattr(self, k, long(v[0]))
+            elif k == 'force':
+                setattr(self, k, True)
+            elif k == 'version':
+                ver, rel = v[0].rsplit('-')
+                setattr(self, 'ver', ver)
+                setattr(self, 'rel', rel)
+            else:
+                # files, depends, etc.
+                setattr(self, k, v)
 
 
 def find_user(userstring):
@@ -187,8 +166,8 @@ def populate_pkg(dbpkg, repopkg, force=False, timestamp=None):
     dbpkg.license = repopkg.license
     dbpkg.url = repopkg.url
     dbpkg.filename = repopkg.filename
-    dbpkg.compressed_size = int(repopkg.csize)
-    dbpkg.installed_size = int(repopkg.isize)
+    dbpkg.compressed_size = repopkg.csize
+    dbpkg.installed_size = repopkg.isize
     try:
         dbpkg.build_date = datetime.utcfromtimestamp(int(repopkg.builddate))
     except ValueError:
@@ -335,33 +314,24 @@ def db_update(archname, reponame, pkgs, options):
     logger.info('Finished updating Arch: %s' % archname)
 
 
-def parse_inf(iofile):
+def parse_info(iofile):
     """
     Parses an Arch repo db information file, and returns variables as a list.
-
-    Arguments:
-     iofile -- A StringIO, FileType, or other object with readlines method.
-
     """
     store = {}
-    lines = iofile.readlines()
     blockname = None
-    max_len = len(lines)
-    i = 0
-    while i < max_len:
-        line = lines[i].strip()
-        if len(line) > 0 and line[0] == '%' and line[1:-1].lower() in REPOVARS:
+    for line in iofile:
+        line = line.strip()
+        if len(line) == 0:
+            continue
+        elif line.startswith('%') and line.endswith('%'):
             blockname = line[1:-1].lower()
             logger.debug("Parsing package block %s", blockname)
             store[blockname] = []
-            i += 1
-            while i < max_len and len(lines[i].strip()) > 0:
-                store[blockname].append(lines[i].strip())
-                i += 1
-            # here is where i would convert arrays to strings
-            # based on count and type, but i dont think it is needed now
-        i += 1
-
+        elif blockname:
+            store[blockname].append(line)
+        else:
+            raise Exception("Read package info outside a block: %s" % line)
     return store
 
 
@@ -391,29 +361,30 @@ def parse_repo(repopath):
     ## repo-add enforces this
     logger.debug("Starting package parsing")
     dbfiles = ('desc', 'depends', 'files')
-    pkgs = []
-    tpkg = None
-    while True:
-        tarinfo = repodb.next()
-        if tarinfo == None or tarinfo.isdir():
-            if tpkg != None:
-                tpkg.reset()
-                data = parse_inf(tpkg)
-                p = Pkg(data, reponame)
-                logger.debug("Done parsing package %s", p.name)
-                pkgs.append(p)
-            if tarinfo == None:
-                break
-            # set new tpkg
-            tpkg = StringIO()
-        if tarinfo.isreg():
-            fname = os.path.split(tarinfo.name)[1]
-            if fname in dbfiles:
-                tpkg.write(repodb.extractfile(tarinfo).read())
-                tpkg.write('\n') # just in case
+    pkgs = {}
+    for tarinfo in repodb.getmembers():
+        if tarinfo.isdir():
+            continue
+        elif tarinfo.isreg():
+            pkgid, fname = os.path.split(tarinfo.name)
+            if fname not in dbfiles:
+                continue
+            data_file = repodb.extractfile(tarinfo)
+            data_file = codecs.EncodedFile(data_file, 'utf-8')
+            try:
+                data = parse_info(data_file)
+                p = pkgs.setdefault(pkgid, Pkg(reponame))
+                p.populate(data)
+            except UnicodeDecodeError, e:
+                logger.warn("Could not correctly decode %s, skipping file" % \
+                        tarinfo.name)
+            data_file.close()
+
+            logger.debug("Done parsing file %s", fname)
+
     repodb.close()
-    logger.info("Finished repo parsing")
-    return (reponame, pkgs)
+    logger.info("Finished repo parsing, %d total packages" % len(pkgs))
+    return (reponame, pkgs.values())
 
 def validate_arch(arch):
     "Check if arch is valid."
-- 
cgit v1.2.3-55-g3dc8