summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Dan McGee <dan@archlinux.org> 2011-02-23 09:46:54 -0600
committer: Dan McGee <dan@archlinux.org> 2011-02-23 09:46:54 -0600
commit: f6c41b273c8962718b303c6050c2fd8bcea533a8 (patch)
tree: a6ec976a0f225852faa74bbb0955035f228ba78e
parent: dfc4d919f1b0349d5143764c3f8f62c240e50623 (diff)
download: archweb-f6c41b273c8962718b303c6050c2fd8bcea533a8.tar.gz
archweb-f6c41b273c8962718b303c6050c2fd8bcea533a8.zip
reporead performance improvements
When importing over a million files, it makes sense to take the slightly faster route and call the PackageFile() constructor directly rather than going through the related manager's create() method. We can also get huge performance improvements, especially with files databases, by using the 'io' module rather than the 'codecs' module. The former is implemented in C as of Python 2.7, which cuts a no-work import of extra.files.tar.gz (so measuring only the DB read speed) from ~30 seconds to ~5 seconds. Signed-off-by: Dan McGee <dan@archlinux.org>
-rw-r--r-- devel/management/commands/reporead.py 21
1 files changed, 18 insertions, 3 deletions
diff --git a/devel/management/commands/reporead.py b/devel/management/commands/reporead.py
index 72595c63..bda3bd61 100644
--- a/devel/management/commands/reporead.py
+++ b/devel/management/commands/reporead.py
@@ -27,9 +27,17 @@ import logging
from datetime import datetime
from optparse import make_option
+# New in 2.6, but fast (C implementation) in 2.7. We will use it over codecs if
+# available. Eventually remove the codecs import completely.
+io = None
+try:
+ import io
+except ImportError:
+ pass
+
from logging import ERROR, WARNING, INFO, DEBUG
-from main.models import Arch, Package, PackageDepend, Repo
+from main.models import Arch, Package, PackageDepend, PackageFile, Repo
logging.basicConfig(
level=WARNING,
@@ -241,10 +249,13 @@ def populate_files(dbpkg, repopkg, force=False):
dirname, filename = f.rsplit('/', 1)
if filename == '':
filename = None
- dbpkg.packagefile_set.create(
+ # this is basically like calling dbpkg.packagefile_set.create(),
+ # but much faster as we can skip a lot of the repeated code paths
+ pkgfile = PackageFile(pkg=dbpkg,
is_directory=(filename is None),
directory=dirname + '/',
filename=filename)
+ pkgfile.save()
dbpkg.files_last_update = datetime.now()
dbpkg.save()
@@ -394,7 +405,11 @@ def parse_repo(repopath):
if fname not in dbfiles:
continue
data_file = repodb.extractfile(tarinfo)
- data_file = codecs.EncodedFile(data_file, 'utf-8')
+ if io is None:
+ data_file = codecs.EncodedFile(data_file, 'utf-8')
+ else:
+ data_file = io.TextIOWrapper(io.BytesIO(data_file.read()),
+ encoding='utf=8')
try:
data = parse_info(data_file)
p = pkgs.setdefault(pkgid, Pkg(reponame))