author     Dan McGee <dan@archlinux.org>  2010-02-10 21:28:49 -0600
committer  Dan McGee <dan@archlinux.org>  2010-02-10 21:30:35 -0600
commit     c1697ab694fe549d7b6ff81a00737a2ad63e9461 (patch)
tree       476b51edc0a535455d9f5e852ee6879b4b2f1e43 /devel
parent     25ce92969ac9e9c7f02f42470c2d019b630b958a (diff)
download   archweb-c1697ab694fe549d7b6ff81a00737a2ad63e9461.tar.gz
           archweb-c1697ab694fe549d7b6ff81a00737a2ad63e9461.zip
reporead: turn into a django-admin command
Rather than struggle with getting the environment set up, let's make this
a custom Django admin command and use the flexibility that gives us. This
is the initial rough cut of making it happen; further commits should clean
up some of the rough edges.

Signed-off-by: Dan McGee <dan@archlinux.org>
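For context, Django picks up custom admin commands from an app's management/commands/ package (hence the two empty __init__.py files added below); each command module defines a Command class derived from BaseCommand. A minimal sketch of that skeleton follows, assuming the Django 1.x-era API in use at the time; the module name and messages are illustrative only, and the real implementation is the reporead.py added in this diff.

# devel/management/commands/example.py -- hypothetical module, for illustration only
from django.core.management.base import BaseCommand, CommandError

class Command(BaseCommand):
    args = 'ARCH PATH'
    help = 'Skeleton of a custom admin command.'

    def handle(self, *args, **options):
        # positional arguments arrive via *args when invoked as
        #   ./manage.py example i686 /tmp/core.db.tar.gz
        if len(args) != 2:
            raise CommandError('two arguments required: ARCH PATH')
        arch, path = args
        # ... real work would happen here ...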
Diffstat (limited to 'devel')
-rw-r--r--  devel/management/__init__.py              0
-rw-r--r--  devel/management/commands/__init__.py     0
-rwxr-xr-x  devel/management/commands/reporead.py   339
3 files changed, 339 insertions, 0 deletions
diff --git a/devel/management/__init__.py b/devel/management/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/devel/management/__init__.py
diff --git a/devel/management/commands/__init__.py b/devel/management/commands/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/devel/management/commands/__init__.py
diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py
new file mode 100755
index 00000000..b53e259c
--- /dev/null
+++ b/devel/management/commands/reporead.py
@@ -0,0 +1,339 @@
+# -*- coding: utf-8 -*-
+"""
+reporead command
+
+Parses a repo.db.tar.gz file and updates the Arch database with the relevant
+changes.
+
+Usage: ./manage.py reporead ARCH PATH
+ ARCH: architecture to update; one of: i686, x86_64
+ PATH: full path to the repo.db.tar.gz file.
+
+Example:
+ ./manage.py reporead i686 /tmp/core.db.tar.gz
+"""
+
+# multi value blocks
+REPOVARS = ['arch', 'backup', 'base', 'builddate', 'conflicts', 'csize',
+ 'deltas', 'depends', 'desc', 'filename', 'files', 'force',
+ 'groups', 'installdate', 'isize', 'license', 'md5sum',
+ 'name', 'optdepends', 'packager', 'provides', 'reason',
+ 'replaces', 'size', 'url', 'version']
+
+
+from django.core.management.base import BaseCommand
+from django.conf import settings
+from django.db import models, transaction
+from django.core import management
+
+import os
+import re
+import sys
+import tarfile
+import logging
+from datetime import datetime
+from optparse import make_option
+
+from cStringIO import StringIO
+from logging import WARNING,INFO,DEBUG
+
+from main.models import Arch, Package, Repo
+
+class SomethingFishyException(Exception):
+    '''Raised when the database looks like it's going to wipe out a bunch of
+    packages.'''
+ pass
+
+logging.basicConfig(
+ level=WARNING,
+ format='%(asctime)s -> %(levelname)s: %(message)s',
+ datefmt='%Y-%m-%d %H:%M:%S',
+ stream=sys.stderr)
+logger = logging.getLogger()
+
+class Command(BaseCommand):
+ option_list = BaseCommand.option_list
+
+ def handle(self, arch=None, file=None, **options):
+        logger.setLevel(INFO)
+        if arch is None or file is None:
+ usage()
+ return 0
+ file = os.path.normpath(file)
+ read_repo(arch, file)
+
+
+class Pkg(object):
+ """An interim 'container' object for holding Arch package data."""
+
+ def __init__(self, val):
+ selfdict = {}
+ squash = ['arch', 'builddate', 'csize', 'desc', 'filename',
+ 'installdate', 'isize', 'license', 'md5sum',
+ 'packager', 'size', 'url']
+
+ selfdict['name'] = val['name'][0]
+ selfdict['base'] = None
+ del val['name']
+ if 'desc' not in val:
+ logger.warning("Package %s has no description" % selfdict['name'])
+ val['desc'] = None
+ if 'url' not in val:
+ val['url'] = None
+ if 'license' not in val:
+ val['license'] = []
+ for x in val.keys():
+ if x in squash:
+                if val[x] is None or len(val[x]) == 0:
+ logger.warning("Package %s has no %s" % (selfdict['name'],x))
+ selfdict[x] = None
+ else:
+ selfdict[x] = ', '.join(val[x])
+                    # make sure we don't exceed the 255-char limit of the
+                    # db char fields
+                    if len(selfdict[x]) > 255:
+                        selfdict[x] = selfdict[x][:255]
+ elif x == 'base':
+ selfdict[x] = val[x][0]
+ elif x == 'force':
+ selfdict[x] = True
+ elif x == 'version':
+                version = val[x][0].rsplit('-', 1)
+ selfdict['ver'] = version[0]
+ selfdict['rel'] = version[1]
+ elif x == 'reason':
+ selfdict[x] = int(val[x][0])
+ else:
+ selfdict[x] = val[x]
+ self.__dict__ = selfdict
+
+ def __getattr__(self,name):
+ if name == 'force':
+ return False
+ else:
+ return None
+
+
+def usage():
+ """Print the usage of this application."""
+ print __doc__.strip()
+
+
+def populate_pkg(dbpkg, repopkg, timestamp=None):
+ if not timestamp: timestamp = datetime.now()
+ dbpkg.pkgbase = repopkg.base
+ dbpkg.pkgver = repopkg.ver
+ dbpkg.pkgrel = repopkg.rel
+ dbpkg.pkgdesc = repopkg.desc
+ dbpkg.license = repopkg.license
+ dbpkg.url = repopkg.url
+ dbpkg.needupdate = False
+ dbpkg.last_update = timestamp
+ dbpkg.save()
+ # files are not in the repo.db.tar.gz
+ #for x in repopkg.files:
+ # dbpkg.packagefile_set.create(path=x)
+ dbpkg.packagedepend_set.all().delete()
+ if 'depends' in repopkg.__dict__:
+ for y in repopkg.depends:
+ # make sure we aren't adding self depends..
+ # yes *sigh* i have seen them in pkgbuilds
+ dpname,dpvcmp = re.match(r"([a-z0-9._+-]+)(.*)", y).groups()
+ if dpname == repopkg.name:
+ logger.warning('Package %s has a depend on itself' % repopkg.name)
+ continue
+ dbpkg.packagedepend_set.create(depname=dpname, depvcmp=dpvcmp)
+ logger.debug('Added %s as dep for pkg %s' % (dpname,repopkg.name))
+
+
+def db_update(archname, pkgs):
+ """
+ Parses a list and updates the Arch dev database accordingly.
+
+ Arguments:
+ pkgs -- A list of Pkg objects.
+
+ """
+ logger.info('Updating Arch: %s' % archname)
+ repository = Repo.objects.get(name__iexact=pkgs[0].repo)
+ architecture = Arch.objects.get(name__iexact=archname)
+ dbpkgs = Package.objects.filter(arch=architecture, repo=repository)
+ # It makes sense to fully evaluate our DB query now because we will
+ # be using 99% of the objects in our "in both sets" loop. Force eval
+ # by calling list() on the QuerySet.
+ list(dbpkgs)
+ # This makes our inner loop where we find packages by name *way* more
+ # efficient by not having to go to the database for each package to
+ # SELECT them by name.
+ dbdict = dict([(pkg.pkgname, pkg) for pkg in dbpkgs])
+ now = datetime.now()
+
+ # go go set theory!
+ # thank you python for having a set class <3
+ logger.debug("Creating sets")
+ dbset = set([pkg.pkgname for pkg in dbpkgs])
+ syncset = set([pkg.name for pkg in pkgs])
+ logger.info("%d packages in current web DB" % len(dbset))
+ logger.info("%d packages in new updating db" % len(syncset))
+ # packages in syncdb and not in database (add to database)
+ logger.debug("Set theory: Packages in syncdb not in database")
+ in_sync_not_db = syncset - dbset
+ logger.info("%d packages in sync not db" % len(in_sync_not_db))
+
+ # Try to catch those random orphaning issues that make Eric so unhappy.
+ if len(dbset) > 20:
+ dbpercent = 100.0 * len(syncset) / len(dbset)
+ else:
+ # we don't have 20 packages in this repo/arch, so this check could
+ # produce a lot of false positives (or a div by zero). fake it
+ dbpercent = 100.0
+ logger.info("DB package ratio: %.1f%%" % dbpercent)
+ if dbpercent < 50.0 and repository.name.lower().find('testing') == -1:
+ logger.error(".db.tar.gz has %.1f%% the number of packages in the web database" % dbpercent)
+ raise SomethingFishyException(
+ 'It looks like the syncdb is less than half the size of the web db. WTF?')
+
+ if dbpercent < 75.0:
+ logger.warning(".db.tar.gz has %.1f%% the number of packages in the web database." % dbpercent)
+
+ for p in [x for x in pkgs if x.name in in_sync_not_db]:
+ logger.info("Adding package %s", p.name)
+ pkg = Package(pkgname = p.name, arch = architecture, repo = repository)
+ populate_pkg(pkg, p, timestamp=now)
+
+ # packages in database and not in syncdb (remove from database)
+ logger.debug("Set theory: Packages in database not in syncdb")
+ in_db_not_sync = dbset - syncset
+ for p in in_db_not_sync:
+ logger.info("Removing package %s from database", p)
+ Package.objects.get(
+ pkgname=p, arch=architecture, repo=repository).delete()
+
+ # packages in both database and in syncdb (update in database)
+ logger.debug("Set theory: Packages in database and syncdb")
+ pkg_in_both = syncset & dbset
+ for p in [x for x in pkgs if x.name in pkg_in_both]:
+ logger.debug("Looking for package updates")
+ dbp = dbdict[p.name]
+        if p.ver == dbp.pkgver and p.rel == dbp.pkgrel:
+ continue
+ logger.info("Updating package %s in database", p.name)
+ pkg = Package.objects.get(
+ pkgname=p.name,arch=architecture, repo=repository)
+ populate_pkg(pkg, p, timestamp=now)
+
+ logger.info('Finished updating Arch: %s' % archname)
+
+
+def parse_inf(iofile):
+ """
+ Parses an Arch repo db information file, and returns variables as a list.
+
+ Arguments:
+ iofile -- A StringIO, FileType, or other object with readlines method.
+
+ """
+ store = {}
+ lines = iofile.readlines()
+ blockname = None
+    numlines = len(lines)
+    i = 0
+    while i < numlines:
+ line = lines[i].strip()
+ if len(line) > 0 and line[0] == '%' and line[1:-1].lower() in REPOVARS:
+ blockname = line[1:-1].lower()
+ logger.debug("Parsing package block %s",blockname)
+ store[blockname] = []
+ i += 1
+            while i < numlines and len(lines[i].strip()) > 0:
+ store[blockname].append(lines[i].strip())
+ i += 1
+ # here is where i would convert arrays to strings
+ # based on count and type, but i dont think it is needed now
+ i += 1
+
+ return store
+
+
+def parse_repo(repopath):
+ """
+ Parses an Arch repo db file, and returns a list of Pkg objects.
+
+ Arguments:
+ repopath -- The path of a repository db file.
+
+ """
+ logger.info("Starting repo parsing")
+ if not os.path.exists(repopath):
+ logger.error("Could not read file %s", repopath)
+
+ logger.info("Reading repo tarfile %s", repopath)
+ filename = os.path.split(repopath)[1]
+ rindex = filename.rindex('.db.tar.gz')
+ reponame = filename[:rindex]
+
+ repodb = tarfile.open(repopath,"r:gz")
+ ## assuming well formed tar, with dir first then files after
+ ## repo-add enforces this
+ logger.debug("Starting package parsing")
+ pkgs = []
+ tpkg = None
+ while True:
+ tarinfo = repodb.next()
+        if tarinfo is None or tarinfo.isdir():
+            if tpkg is not None:
+ tpkg.reset()
+ data = parse_inf(tpkg)
+ p = Pkg(data)
+ p.repo = reponame
+ logger.debug("Done parsing package %s", p.name)
+ pkgs.append(p)
+            if tarinfo is None:
+ break
+ # set new tpkg
+ tpkg = StringIO()
+ if tarinfo.isreg():
+ if os.path.split(tarinfo.name)[1] in ('desc','depends'):
+ tpkg.write(repodb.extractfile(tarinfo).read())
+ tpkg.write('\n') # just in case
+ repodb.close()
+ logger.info("Finished repo parsing")
+ return pkgs
+
+@transaction.commit_on_success
+def read_repo(arch, file):
+ """
+    Parses a repo.db.tar.gz file and returns an exit status.
+ """
+ # check if arch is valid
+ available_arches = [x.name for x in Arch.objects.all()]
+ if arch not in available_arches:
+ usage()
+ return 0
+ else:
+ primary_arch = arch
+
+ packages = parse_repo(file)
+
+ # sort packages by arch -- to handle noarch stuff
+ packages_arches = {}
+ for arch in available_arches:
+ packages_arches[arch] = []
+
+ for package in packages:
+ if package.arch in ('any', primary_arch):
+ packages_arches[package.arch].append(package)
+ else:
+ logger.warning("Package %s arch = %s" % (
+ package.name,package.arch))
+ #package.arch = primary_arch
+
+
+ logger.info('Starting database updates.')
+ for (arch, pkgs) in packages_arches.iteritems():
+ if len(pkgs) > 0:
+ db_update(arch,pkgs)
+ logger.info('Finished database updates.')
+ return 0
+
+# vim: set ts=4 sw=4 et: