summaryrefslogtreecommitdiffstats
path: root/mirrors
diff options
context:
space:
mode:
authorDan McGee <dan@archlinux.org>2013-04-14 13:09:41 -0500
committerDan McGee <dan@archlinux.org>2013-04-14 13:12:13 -0500
commit213aa3a2fab6f3a56be348a067c132f568efbaff (patch)
treef94ea0dbb71c51b38f09203a07b072e4b2af6d2b /mirrors
parent7fc8da7d959556b1204b7864959e73e7f5f5ec59 (diff)
downloadarchweb-213aa3a2fab6f3a56be348a067c132f568efbaff.tar.gz
archweb-213aa3a2fab6f3a56be348a067c132f568efbaff.zip
Reduce mirror status query madness
Move completely to custom SQL for this logic. The Django ORM just doesn't play nice with the kind of query we are looking to do, so it is easier to do using raw SQL. The biggest pain factor here is in supporting sqlite as it doesn't have nearly the capabilities in handling datetime types directly in the database, as well as having some different type conversion necessities. Signed-off-by: Dan McGee <dan@archlinux.org>
Diffstat (limited to 'mirrors')
-rw-r--r--mirrors/utils.py146
1 files changed, 91 insertions, 55 deletions
diff --git a/mirrors/utils.py b/mirrors/utils.py
index d18dc22f..eb1211f1 100644
--- a/mirrors/utils.py
+++ b/mirrors/utils.py
@@ -2,6 +2,7 @@ from datetime import timedelta
from django.db import connection
from django.db.models import Avg, Count, Max, Min, StdDev
+from django.utils.dateparse import parse_datetime
from django.utils.timezone import now
from django_countries.fields import Country
@@ -11,47 +12,103 @@ from .models import MirrorLog, MirrorProtocol, MirrorUrl
DEFAULT_CUTOFF = timedelta(hours=24)
-def annotate_url(url, delay):
- '''Given a MirrorURL object, add a few more attributes to it regarding
- status, including completion_pct, delay, and score.'''
- url.completion_pct = float(url.success_count) / url.check_count
- if delay is not None:
- url.delay = delay
- hours = url.delay.days * 24.0 + url.delay.seconds / 3600.0
- if url.completion_pct > 0:
- divisor = url.completion_pct
- else:
- # arbitrary small value
- divisor = 0.005
- url.score = (hours + url.duration_avg + url.duration_stddev) / divisor
- else:
- url.delay = None
- url.score = None
+def dictfetchall(cursor):
+ "Returns all rows from a cursor as a dict."
+ desc = cursor.description
+ return [
+ dict(zip([col[0] for col in desc], row))
+ for row in cursor.fetchall()
+ ]
-def url_delays(cutoff_time, mirror_id=None):
- cursor = connection.cursor()
- if mirror_id is None:
- sql= """
-SELECT url_id, AVG(check_time - last_sync)
-FROM mirrors_mirrorlog
-WHERE is_success = %s AND check_time >= %s AND last_sync IS NOT NULL
-GROUP BY url_id
+def status_data(cutoff_time, mirror_id=None):
+ if mirror_id is not None:
+ params = [cutoff_time, mirror_id]
+ mirror_where = 'AND u.mirror_id = %s'
+ else:
+ params = [cutoff_time]
+ mirror_where = ''
+
+ vendor = database_vendor(MirrorUrl)
+ if vendor == 'sqlite':
+ sql = """
+SELECT l.url_id, u.mirror_id,
+ COUNT(l.id) AS check_count,
+ COUNT(l.duration) AS success_count,
+ MAX(l.last_sync) AS last_sync,
+ MAX(l.check_time) AS last_check,
+ AVG(l.duration) AS duration_avg,
+ 0.0 AS duration_stddev,
+ AVG(STRFTIME('%%s', check_time) - STRFTIME('%%s', last_sync)) AS delay
+FROM mirrors_mirrorlog l
+JOIN mirrors_mirrorurl u ON u.id = l.url_id
+WHERE l.check_time >= %s
+""" + mirror_where + """
+GROUP BY l.url_id, u.mirror_id
"""
- cursor.execute(sql, [True, cutoff_time])
else:
sql = """
-SELECT l.url_id, avg(check_time - last_sync)
+SELECT l.url_id, u.mirror_id,
+ COUNT(l.id) AS check_count,
+ COUNT(l.duration) AS success_count,
+ MAX(l.last_sync) AS last_sync,
+ MAX(l.check_time) AS last_check,
+ AVG(l.duration) AS duration_avg,
+ STDDEV(l.duration) AS duration_stddev,
+ AVG(check_time - last_sync) AS delay
FROM mirrors_mirrorlog l
JOIN mirrors_mirrorurl u ON u.id = l.url_id
-WHERE is_success = %s AND check_time >= %s AND last_sync IS NOT NULL
-AND mirror_id = %s
-GROUP BY url_id
+WHERE l.check_time >= %s
+""" + mirror_where + """
+GROUP BY l.url_id, u.mirror_id
"""
- cursor.execute(sql, [True, cutoff_time, mirror_id])
- return {url_id: delay for url_id, delay in cursor.fetchall()}
+ cursor = connection.cursor()
+ cursor.execute(sql, params)
+ url_data = dictfetchall(cursor)
+
+ # sqlite loves to return less than ideal types
+ if vendor == 'sqlite':
+ for item in url_data:
+ item['delay'] = timedelta(seconds=item['delay'])
+ item['last_sync'] = parse_datetime(item['last_sync'])
+ item['last_check'] = parse_datetime(item['last_check'])
+
+ return {item['url_id']: item for item in url_data}
+
+
+def annotate_url(url, url_data):
+ '''Given a MirrorURL object, add a few more attributes to it regarding
+ status, including completion_pct, delay, and score.'''
+ known_attrs = (
+ ('success_count', 0),
+ ('check_count', 0),
+ ('completion_pct', None),
+ ('last_check', None),
+ ('last_sync', None),
+ ('delay', None),
+ ('score', None),
+ )
+ for k, v in known_attrs:
+ setattr(url, k, v)
+ for k, v in url_data.items():
+ if k not in ('url_id', 'mirror_id'):
+ setattr(url, k, v)
+
+ if url.check_count > 0:
+ url.completion_pct = float(url.success_count) / url.check_count
+
+ if url.delay is not None:
+ hours = url.delay.days * 24.0 + url.delay.seconds / 3600.0
+
+ if url.completion_pct > 0:
+ divisor = url.completion_pct
+ else:
+ # arbitrary small value
+ divisor = 0.005
+ stddev = url.duration_stddev or 0.0
+ url.score = (hours + url.duration_avg + stddev) / divisor
@cache_function(123)
@@ -65,29 +122,14 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_id=None):
if mirror_id:
valid_urls = valid_urls.filter(mirror_id=mirror_id)
- url_data = MirrorUrl.objects.values('id', 'mirror_id').filter(
- id__in=valid_urls, logs__check_time__gte=cutoff_time).annotate(
- check_count=Count('logs'),
- success_count=Count('logs__duration'),
- last_sync=Max('logs__last_sync'),
- last_check=Max('logs__check_time'),
- duration_avg=Avg('logs__duration'))
-
- vendor = database_vendor(MirrorUrl)
- if vendor != 'sqlite':
- url_data = url_data.annotate(duration_stddev=StdDev('logs__duration'))
-
+ url_data = status_data(cutoff_time, mirror_id)
urls = MirrorUrl.objects.select_related('mirror', 'protocol').filter(
id__in=valid_urls).order_by('mirror__id', 'url')
- delays = url_delays(cutoff_time, mirror_id)
if urls:
- url_data = dict((item['id'], item) for item in url_data)
for url in urls:
- for k, v in url_data.get(url.id, {}).items():
- if k not in ('id', 'mirror_id'):
- setattr(url, k, v)
- last_check = max([u.last_check for u in urls])
+ annotate_url(url, url_data.get(url.id, {}))
+ last_check = max([u.last_check for u in urls if u.last_check])
num_checks = max([u.check_count for u in urls])
check_info = MirrorLog.objects.filter(check_time__gte=cutoff_time)
if mirror_id:
@@ -104,12 +146,6 @@ def get_mirror_statuses(cutoff=DEFAULT_CUTOFF, mirror_id=None):
num_checks = 0
check_frequency = None
- for url in urls:
- # fake the standard deviation for local testing setups
- if vendor == 'sqlite':
- setattr(url, 'duration_stddev', 0.0)
- annotate_url(url, delays.get(url.id, None))
-
return {
'cutoff': cutoff,
'last_check': last_check,