path: root/planet/management/commands/update_planet.py
blob: fd74c2910817d5567540279b733f3c4a49cb7446
"""
update_planet

Imports all feeds for users who have filled in a valid website and website_rss.
The number of items imported per feed is limited to the configured RSS_FEED_LIMIT.

Usage: ./manage.py update_planet
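
Typically run on a schedule; a hypothetical crontab entry:
    */30 * * * * /path/to/manage.py update_planet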
"""


import logging
import sys

from calendar import timegm
from datetime import datetime
from pytz import utc

import bleach
import feedparser

from django.core.cache import cache
from django.core.management.base import BaseCommand
from django.template.defaultfilters import truncatewords_html
from django.conf import settings

from planet.models import Feed, FeedItem, FEEDITEM_SUMMARY_LIMIT


logging.basicConfig(
    level=logging.WARNING,
    format='%(asctime)s -> %(levelname)s: %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    stream=sys.stderr)
logger = logging.getLogger()


class ItemOlderThanLatest(Exception):
    """Raised when an entry is not newer than the latest stored item."""


class Command(BaseCommand):

    def parse_feed(self, feed_instance):
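        """Fetch feed_instance.website_rss and store any entries newer than
        the latest item already imported, honouring HTTP etags."""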
        latest = None
        items = []
        url = feed_instance.website_rss

        logger.debug("Import new feed items for '%s'", url)

        try:
            # .latest() relies on the model's Meta.get_latest_by to return
            # the most recently published item for this feed.
            latest = FeedItem.objects.filter(feed=feed_instance).latest()
        except FeedItem.DoesNotExist:
            pass

        # Send the cached etag so the server can answer 304 Not Modified
        # instead of re-sending an unchanged feed.
        etag = cache.get(f'planet:etag:{url}')
        feed = feedparser.parse(url, etag=etag)

        # feedparser does not raise on network errors; it sets 'bozo' and
        # omits 'status' when no HTTP response was received.
        status = feed.get('status')
        if status is None:
            logger.info("error fetching feed: '%s', no HTTP response received", url)
            return

        if status == 304:
            logger.info("The feed '%s' has not changed since we last checked it", url)
            if 'etag' in feed:
                cache.set(f'planet:etag:{url}', feed.etag, 86400)
            return

        if status != 200:
            logger.info("error parsing feed: '%s', status: '%s'", url, status)
            return

        if not feed.entries:
            logger.info("error parsing feed: '%s', feed has no entries", url)
            return

        for entry in feed.entries[:settings.RSS_FEED_LIMIT]:
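            # Feeds conventionally list entries newest-first, so the first
            # entry that is not newer than the stored latest item ends the run.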
            try:
                item = self.parse_entry(entry, feed_instance, latest)
            except ItemOlderThanLatest:
                break

            if not item:
                continue

            items.append(item)

        logger.debug("inserting %d feed entries", len(items))
        res = FeedItem.objects.bulk_create(items)

        if res and 'etag' in feed:
            # Cache etag for one day
            logger.debug("cache etag for '%s'", url)
            cache.set(f'planet:etag:{url}', feed.etag, 86400)

    def parse_entry(self, entry, feed_instance, latest):
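        """Build an unsaved FeedItem from a feedparser entry.

        Returns None for entries without a usable date or link; raises
        ItemOlderThanLatest when the entry is not newer than `latest`.
        """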
        url = feed_instance.website_rss
        published_parsed = entry.get('published_parsed')
        # Maybe it's an atom feed?
        if not published_parsed:
            published_parsed = entry.get('updated_parsed')

        if not published_parsed:
            logger.error("feed: '%s' has no published or updated date", url)
            return

        # published_parsed is a struct_time already normalised to UTC, so use
        # timegm; time.mktime would wrongly apply the local timezone.
        published = datetime.fromtimestamp(timegm(published_parsed), tz=utc)

        if latest and latest.publishdate >= published:
            logger.debug("feed: '%s' has no more new entries", url)
            raise ItemOlderThanLatest()

        if not entry.link:
            logger.error("feed '%s' entry has no link, skipping", url)
            return

        logger.debug("import feed entry '%s'", entry.title)

        item = FeedItem(title=entry.title, publishdate=published, url=entry.link)
        item.feed = feed_instance

        if entry.get('description'):
            summary = bleach.clean(entry.description, strip=True)
            if len(summary) > FEEDITEM_SUMMARY_LIMIT:
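                # truncatewords_html truncates on word boundaries and closes
                # any HTML tags left open by the cut.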
                summary = truncatewords_html(summary, 100)
            item.summary = summary

        if entry.get('author'):
            item.author = entry.get('author')

        return item

    def handle(self, *args, **options):
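        """Map Django's --verbosity option to a log level, then import new
        items for every feed and trim each feed to RSS_FEED_LIMIT items."""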
        verbosity = int(options.get('verbosity', 0))
        if verbosity == 0:
            logger.setLevel(logging.ERROR)
        elif verbosity == 1:
            logger.setLevel(logging.INFO)
        elif verbosity >= 2:
            logger.setLevel(logging.DEBUG)

        for feed in Feed.objects.all():
            self.parse_feed(feed)

            # Only keep the most recent RSS_FEED_LIMIT items per feed.
            items_to_keep = FeedItem.objects.filter(feed=feed).order_by('-publishdate')[:settings.RSS_FEED_LIMIT]
            deleted, _ = FeedItem.objects.filter(feed=feed).exclude(pk__in=items_to_keep).delete()
            logger.debug("removed %d feed entries", deleted)

# vim: set ts=4 sw=4 et: