# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
print_function, unicode_literals)
from builtins import *
from future import standard_library
standard_library.install_aliases()
import re
import os
from datetime import datetime
from collections import OrderedDict, Counter
import codecs
from urllib.parse import urljoin
import json
import tempfile
import filecmp
from bs4 import BeautifulSoup
from lxml import etree
import lxml.html
import requests
from layeredconfig import LayeredConfig
from cached_property import cached_property
from rdflib import Literal, URIRef
from rdflib.namespace import DCTERMS
from ferenda import util, decorators
from ferenda.elements import Preformatted, Body
from ferenda import CompositeRepository, CompositeStore
from ferenda import TextReader, PDFAnalyzer
from ferenda import DocumentEntry, Facet, PDFDocumentRepository
from ferenda.pdfreader import StreamingPDFReader, Textbox
from . import (Trips, NoMoreLinks, Regeringen, Riksdagen,
SwedishLegalSource, SwedishLegalStore, RPUBL, Offtryck)
from .fixedlayoutsource import FixedLayoutStore, FixedLayoutSource
from .swedishlegalsource import lazyread
from .elements import Sidbrytning
def prop_sanitize_identifier(identifier):
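    """Normalize a proposition identifier to the canonical "Prop. 1997/98:44"
    form: fix casing and a missing "Prop. " prefix, add missing periods and
    spaces, replace non-breaking spaces, and shorten "2009/2010:87"-style
    year spans to "2009/10:87". Returns the result as an rdflib Literal
    (empty identifiers are returned untouched); raises ValueError if the
    result still looks irregular."""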
if not identifier:
        return identifier  # allow infer_identifier to do its magic later
if identifier.startswith("prop"):
identifier = util.ucfirst(identifier)
if identifier.startswith("PROP"):
identifier = identifier.replace("PROP", "Prop")
if identifier.startswith("Prop "):
identifier = identifier.replace("Prop ", "Prop. ")
if re.match("Prop\.\d{4}", identifier): # missing space
identifier = identifier.replace("Prop.", "Prop. ")
if "\xa0" in identifier: # Non-breakable space
identifier = identifier.replace("\xa0", " ")
if not identifier.startswith("Prop. "):
identifier = "Prop. " + identifier
# identify and correct the not-uncommon "2009/2010:87" pattern (should be 2009/10:87)
m = re.search(r"(\d{4})/(\d{4}):(\d+)$", identifier)
if m and m.group(2) != "2000" and int(m.group(1)) == int(m.group(2)) - 1:
identifier = identifier.replace(m.group(2), m.group(2)[-2:])
if not re.match(r"^Prop\. (19|20)\d{2}(|/\d{2}|/2000):(|B ?|U ?)[1-9]\d{0,2}$", identifier):
raise ValueError("Irregular identifier %s" % identifier)
return Literal(identifier)
class PropAnalyzer(PDFAnalyzer):
    # NOTE: The cutoff used to be 0.5%, but it turns out that h2's in
    # particular can be quite rare, occurring maybe two times in an
    # entire document.
style_significance_threshold = 0.001
@cached_property
def documents(self):
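        """Segment the PDF into logical documents.

        Returns a list of [startpage, pagecount, tag] entries, where tag
        is one of 'frontmatter', 'main', 'appendix' or 'eudok'."""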
def boxmatch(page, textpattern, bb=None):
if bb is None:
bb = page.boundingbox(bottom=page.height / 5)
for box in bb:
m = re.match(textpattern, str(box))
if m:
return m.group(1)
return None
documents = []
mainstyles = Counter()
pagedims = {'pagewidth': util.TopCounter(),
'pageheight': util.TopCounter()}
currentappendix = None
for pageidx, page in enumerate(self.pdf):
styles = self.count_styles(pageidx, 1)
# find the most dominant style on the page. If it uses the
# EU font, it's a separate section.
if styles and styles.most_common(1)[0][0][0].startswith("EUAlbertina"):
currentdoc = 'eudok'
                currentappendix = boxmatch(page, r"Bilaga (\d)\s*$")
else:
                # if there is a text box matching "Bilaga \d" in the top
                # margin, the bilagenummer is new, and the dominant
                # style (family) is different from any of the top 3
                # current dominant styles:
                #
                # NOTE that normally we want to treat appendices as
                # part of the regular text (so that
                # offtryck_parser.is_appendix et al can do their
                # thing). This heuristic should only catch appendices
                # that are very different.
                appendix = boxmatch(page, r"Bilaga (\d)\s*$")
if (appendix and
appendix != currentappendix and
styles.most_common(1) and
styles.most_common(1)[0][0][0] not in [x[0][0] for x in mainstyles.most_common(3)]):
currentdoc = 'appendix'
elif ".hocr." in self.pdf.filename:
# scanned sources have fluctuating page sizes,
                    # plus it's not possible to identify appendices
# by differing page dimensions
currentdoc = "main"
elif pageidx == 0 and boxmatch(page, "(REGERINGENS PROPOSITION)", page.boundingbox(top=page.height * 0.8)):
currentdoc = "frontmatter"
else:
if (pagedims['pageheight'] and
(abs(pagedims['pageheight'].top() - page.height) > 1 or
abs(pagedims['pagewidth'].top() - page.width) > 1)):
# if the page dimensions suddenly change,
# that's a dead giveaway that some external
# appendix has been lifted right into the PDF
#
# But in some cases dimension change does NOT
# mean external appendix. In Prop 2015/16:195,
# which is split in 4 pdfs (2 logical volumes)
# it's just an artifact due to the 2nd pdf
# being properly cropped while the 1st
# isn't. In prop 2008/09:140, which
# uncharacteristically includes frontmatter, a
# dimension change signals the change from
# frontmatter to main
if currentdoc == "frontmatter":
currentdoc = "main"
else:
currentdoc = 'appendix'
else:
currentdoc = 'main'
currentappendix = appendix
if currentdoc == "main":
mainstyles += styles
pagedims['pagewidth'][page.width] += 1
pagedims['pageheight'][page.height] += 1
# update the current document segment tuple or start a new one
if documents and documents[-1][2] == currentdoc:
documents[-1][1] += 1
else:
documents.append([pageidx, 1, currentdoc])
return documents
def guess_pagenumber_select(self, candidates, probable_pagenumber):
if self.scanned_source:
            # try to avoid assuming that smudges and other crap equal a
            # lower-case L or other things that might be interpreted
            # as roman numerals
if util.is_roman(candidates[0]) and str(probable_pagenumber) == "1":
return 1 # Do not interpret a single 'l' as roman 50
# -- it's probably a badly OCR:ed '
else:
# be a little more conservative with what a good guess
# is compared to PDFAnalyzer.guess_pagenumber_select:
# only accept the smallest candidate larger-or-equal
# to the probable_pagenumber -- but not if it's a
# too-large gap. Also, assume no roman numerals
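            # (illustrative example: with candidates [3, 14, 120] and
            # probable_pagenumber 13, we return 14 -- 3 is too small and
            # 120 exceeds 13 * 2, so it counts as a too-large gap)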
try:
return next(c for c in sorted(candidates) if c >= probable_pagenumber and c <= probable_pagenumber * 2)
except StopIteration: # no suitable candidate
return None
# otherwise fall back to superclass implementation
return super(PropAnalyzer, self).guess_pagenumber_select(candidates, probable_pagenumber)
def guess_pagenumber_boxes(self, page):
"""Return a suitable number of textboxes to scan for a possible page number. """
if self.scanned_source:
# For scanned source, the default strategy works so-so
# (many OCR errors may result in misinterpreting things as
# pagenumbers) so we also take into account the text box
# property. Only select thin boxes (less than 1/50th of
# the page width) -- page numbers should stand by
# themselves and naturally be pretty thin
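            # (e.g. on a 595 pt wide A4 page this keeps only boxes
            # narrower than roughly 12 pt -- illustrative figures)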
return [b for b in list(reversed(page))[:5] + list(page)[:5] if b.width < page.width/50]
else:
return super(PropAnalyzer, self).guess_pagenumber_boxes(page)
def metrics(self, metricspath=None, plotpath=None, startpage=0,
pagecount=None, force=False):
docsegments = self.documents
if len(docsegments) == 1:
return super(PropAnalyzer, self).metrics(metricspath,
plotpath,
startpage,
pagecount, force)
else:
r = []
exclude = []
mainidx = None
for idx, (startpage, pagecount, tag) in enumerate(docsegments):
                r.append(super(PropAnalyzer, self).metrics(startpage=startpage,
                                                           pagecount=pagecount))
if tag != 'main':
exclude.extend(list(range(startpage, startpage+pagecount)))
elif mainidx is None:
mainidx = idx
r[mainidx]['excludedpages'] = exclude
# since we don't pass metricspath to super().metrics, that
# func does not create a metrics.json cache file. So we
# generate that now (using the same data as we return)
util.ensure_dir(metricspath)
with open(metricspath, "w") as fp:
s = json.dumps(r[mainidx], indent=4, separators=(', ', ': '), sort_keys=True)
fp.write(s)
return r[mainidx]
def count_styles(self, startpage, pagecount):
# we should avoid counting the styles on the front page, as
# that page uses a title font, not used anywhere else in the
# document, which is then mistaken for the h1 font.
if not startpage:
startpage = 1
return super(PropAnalyzer, self).count_styles(startpage, pagecount)
class PropRegeringen(Regeringen):
alias = "propregeringen"
re_basefile_strict = re.compile(r'Prop. (\d{4}/\d{2,4}:\d+)')
re_basefile_lax = re.compile(
r'(?:Prop\.?|) ?(\d{4}/\d{2,4}:\d+)', re.IGNORECASE)
    re_urlbasefile_strict = re.compile(r"proposition/\d+/\d+/[a-z]*\.?-?(\d{6})(\d+)-?/$")
    re_urlbasefile_lax = re.compile(r"proposition/\d+/\d+/.*?(\d{4}_?\d{2})[_-]?(\d+)")
rdf_type = RPUBL.Proposition
document_type = Regeringen.PROPOSITION
# sparql_annotations = "sparql/prop-annotations.rq"
def attribs_from_url(self, url):
attribs = super(PropRegeringen, self).attribs_from_url(url)
# correct the not uncommon "2007/20:08123" -> "2007/2008:123" issue
total = attribs["rpubl:arsutgava"] + attribs["rpubl:lopnummer"]
if total.isdigit() and int(total[:4]) - int(total[4:8]) == - 1:
# convert to "2007/2008:123" and let santize_basefile make
# canonical (and warn). This way we don't need to
# specialcase "1999/2000:123"
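            # (worked example: arsutgava "200720" + lopnummer "08123"
            # gives total "20072008123"; the first eight digits become
            # arsutgava "2007/2008" and the remainder "123" becomes
            # lopnummer)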
attribs["rpubl:arsutgava"] = total[:8]
attribs["rpubl:lopnummer"] = total[8:]
y = attribs["rpubl:arsutgava"]
if "/" not in y:
attribs['rpubl:arsutgava'] = "%s/%s" % (y[:4], y[4:])
return attribs
def sanitize_identifier(self, identifier):
return prop_sanitize_identifier(identifier)
class PropTripsStore(FixedLayoutStore):
    # 1993/94 and 1994/95 have only plaintext (wrapped in .html)
    # 1995/96 to 2006/07 have plaintext + doc
    # 2007/08 onwards has plaintext, doc and pdf
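    # Map each candidate suffix to the file signature (magic bytes) used
    # to recognize that format, e.g. b'%PDF' for PDF and
    # b'\xd0\xcf\x11\xe0' for legacy OLE/Word files.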
doctypes = OrderedDict([
(".pdf", b'%PDF'),
(".doc", b'\xd0\xcf\x11\xe0'),
(".docx", b'PK\x03\x04'),
(".wpd", b'\xffWPC'),
(".html", b'<!DO'),
])
def intermediate_path(self, basefile, version=None, attachment=None, suffix=None):
# we need to select a suitable intermediate suffix based upon
# the downloaded suffix (pdf->xml, html->txt)
if self.downloaded_path(basefile).endswith(".html"):
from ferenda.documentstore import _compressed_suffix
return self.path(basefile, "intermediate", ".txt" + _compressed_suffix(self.compression))
else:
return super(PropTripsStore, self).intermediate_path(basefile, version, attachment, suffix)
# We derive from Trips for downloading, from FixedLayoutSource for
# downloaded_to_intermediate, extract_{head,metadata,body}, and from
# Offtryck for almost everything else. FIXME: This is not manageable.
class PropTrips(Trips, Offtryck, FixedLayoutSource):
alias = "proptrips"
ar = ""
start_url = "http://rkrattsbaser.gov.se/prop/adv?dok=P&sort=asc&ar={c.lastyear}"
document_url_template = "http://rkrattsbaser.gov.se/prop?ar=%(year)s&dok=P&dokid=%(ordinal)s"
basefile_regex = "(?P<basefile>\d+/\d+:\d+)$"
downloaded_suffix = ".html"
rdf_type = RPUBL.Proposition
KOMMITTEDIREKTIV = SOU = DS = None
PROPOSITION = "prop"
document_type = PROPOSITION
storage_policy = "dir"
documentstore_class = PropTripsStore
urispace_segment = "prop"
@classmethod
def get_default_options(cls):
opts = super(PropTrips, cls).get_default_options()
opts['lastyear'] = ""
return opts
# don't use @recordlastdownload -- download_get_basefiles_page
# should set self.config.lastyear instead
def download(self, basefile=None):
if self.config.ipbasedurls:
self._make_ipbasedurls()
urlmap_path = self.store.path("urls", "downloaded", ".map",
storage_policy="file")
self.urlmap = {}
if os.path.exists(urlmap_path):
with codecs.open(urlmap_path, encoding="utf-8") as fp:
for line in fp:
url, attachment = line.split("\t")
self.urlmap[url] = attachment.strip()
if basefile:
return super(PropTrips, self).download(basefile)
try:
now = datetime.now()
if ('lastyear' in self.config and
self.config.lastyear and
not self.config.refresh):
maxyear = "%s/%s" % (now.year, (now.year + 1) % 100)
while self.config.lastyear != maxyear:
r = self.inner_download()
else:
self.config.lastyear = ''
r = self.inner_download()
self.config.lastyear = "%s/%s" % (now.year - 1,
(now.year % 100))
LayeredConfig.write(self.config) # assume we have data to write
return r
finally:
with codecs.open(urlmap_path, "w", encoding="utf-8") as fp:
for url, attachment in self.urlmap.items():
fp.write("%s\t%s\n" % (url, attachment))
def inner_download(self):
refresh = self.config.refresh
updated = False
for basefile, url in self.download_get_basefiles(None):
if url in self.urlmap:
attachment = self.urlmap[url]
else:
attachment = self.sniff_attachment(url)
if attachment:
self.urlmap[url] = attachment
attachment += ".html"
else:
self.urlmap[url] = ''
attachment = None # instead of the empty string
if (refresh or
(not os.path.exists(self.store.downloaded_path(basefile, attachment=attachment)))):
ret = self.download_single(basefile, url)
updated = updated or ret
return updated
def sniff_attachment(self, url):
r = requests.get(url, stream=True)
head = r.raw.read(8000)
soup = BeautifulSoup(head, "lxml")
return self.find_attachment(soup)
def find_attachment(self, soup):
results = soup.find("div", "search-results-content")
dokid = results.find("span", string="Dokument:")
if not dokid:
return None
dokid = dokid.next_sibling.strip().split(" ")[-1]
if "/" in dokid:
dokid, attachment = dokid.split("/")
else:
attachment = None
return attachment
def _next_year(self, year):
# "1992/93" -> "1993/94"
# "1998/99" -> "1999/00"
assert len(year) == 7, "invalid year specifier %s" % year
y1, y2 = int(year[:4]) + 1, int(year[-2:]) + 1
return "%04d/%02d" % (int(y1), int(y2) % 100)
def _prev_year(self, year):
# "1993/94" -> "1992/93"
# "1999/00" -> "1998/99"
assert len(year) == 7, "invalid year specifier %s" % year
y1, y2 = int(year[:4]) - 1, int(year[-2:]) - 1
return "%04d/%02d" % (int(y1), int(y2) % 100)
def remote_url(self, basefile):
year, ordinal = basefile.split(":")
return self.document_url_template % locals()
def download_get_basefiles_page(self, soup):
nextpage = None
for hit in soup.findAll("div", "search-hit-info-num"):
basefile = hit.text.split(": ", 1)[1].strip()
m = re.search(self.basefile_regex, basefile)
if m:
basefile = m.group()
else:
self.log.warning("Couldn't find a basefile in this label: %r" % basefile)
continue
docurl = urljoin(self.start_url, hit.parent.a["href"])
yield(self.sanitize_basefile(basefile), docurl)
nextpage = soup.find("div", "search-opt-next").a
if nextpage:
nextpage = urljoin(self.start_url,
nextpage.get("href"))
else:
if self.config.lastyear:
b = self._next_year(self.config.lastyear)
else:
now = datetime.now()
b = "%s/%s" % (now.year - 1, (now.year) % 100)
self.log.info("Advancing year from %s to %s" % (self.config.lastyear, b))
self.config.lastyear = b
raise NoMoreLinks(nextpage)
def download_single(self, basefile, url=None):
if url is None:
url = self.remote_url(basefile)
if not url: # remote_url failed
return
updated = created = False
checked = True
mainattachment = None
if url in self.urlmap:
attachment = self.urlmap[url]
else:
attachment = self.sniff_attachment(url)
if attachment:
self.urlmap[url] = attachment
attachment += ".html"
else:
self.urlmap[url] = ''
attachment = "index.html"
downloaded_path = self.store.downloaded_path(basefile,
attachment=attachment)
created = not os.path.exists(downloaded_path)
if self.download_if_needed(url, basefile, filename=downloaded_path):
text = util.readfile(downloaded_path)
if "<div>Inga tr\xe4ffar</div>" in text:
self.log.warning("%s: Could not find this prop at %s, might be a bug" % (basefile, url))
util.robust_remove(downloaded_path)
return False
if created:
self.log.info("%s: download OK from %s" % (basefile, url))
else:
self.log.info(
"%s: download OK (new version) from %s" % (basefile, url))
updated = True
else:
self.log.debug("%s: exists and is unchanged" % basefile)
text = util.readfile(downloaded_path)
soup = BeautifulSoup(text, "lxml")
del text
attachment = self.find_attachment(soup)
extraurls = []
results = soup.find("div", "search-results-content")
a = results.find("a", string="Hämta Pdf")
if a:
extraurls.append(a.get("href"))
a = results.find("a", string="Hämta Doc")
if a:
extraurls.append(a.get("href"))
# parse downloaded html/text page and find out extraurls
for url in extraurls:
if url.endswith('get=doc'):
                # NOTE: We cannot be sure that this is
                # actually a Word (CDF) file. For older files
                # it might be a WordPerfect file (.wpd) or an
                # RTF file, for newer it might be a .docx. We
                # cannot be sure until we've downloaded it.
                # So we quickly read the first 4 bytes
r = requests.get(url, stream=True)
sig = r.raw.read(4)
# r.raw.close()
#bodyidx = head.index("\n\n")
#sig = head[bodyidx:bodyidx+4]
if sig == b'\xffWPC':
doctype = ".wpd"
elif sig == b'\xd0\xcf\x11\xe0':
doctype = ".doc"
elif sig == b'PK\x03\x04':
doctype = ".docx"
elif sig == b'{\\rt':
doctype = ".rtf"
else:
self.log.error(
"%s: Attached file has signature %r -- don't know what type this is" % (basefile, sig))
continue
elif url.endswith('get=pdf'):
doctype = ".pdf"
else:
self.log.warning("Unknown doc type %s" %
url.split("get=")[-1])
doctype = None
if doctype:
if attachment:
filename = self.store.downloaded_path(
basefile, attachment=attachment + doctype)
else:
                    filename = self.store.downloaded_path(
                        basefile, attachment="index" + doctype)
self.log.debug("%s: downloading attachment %s" % (basefile, filename))
self.download_if_needed(url, basefile, filename=filename)
entry = DocumentEntry(self.store.documententry_path(basefile))
now = datetime.now()
entry.orig_url = url
if created:
entry.orig_created = now
if updated:
entry.orig_updated = now
if checked:
entry.orig_checked = now
entry.save()
return updated
# Correct some invalid identifiers spotted in the wild:
# 1999/20 -> 1999/2000
# 2000/2001 -> 2000/01
# 1999/98 -> 1999/2000
def sanitize_basefile(self, basefile):
(y1, y2, idx) = re.split("[:/]", basefile)
        assert len(y1) == 4, "Basefile %s is invalid beyond sanitization" % basefile
if y1 == "1999" and y2 != "2000":
sanitized = "1999/2000:" + idx
self.log.warning("Basefile given as %s, correcting to %s" %
(basefile, sanitized))
elif (y1 != "1999" and
(len(y2) != 2 or # eg "2000/001"
               int(y1[2:]) + 1 != int(y2))):  # eg "1999/98"
sanitized = "%s/%02d:%s" % (y1, int(y1[2:]) + 1, idx)
self.log.warning("Basefile given as %s, correcting to %s" %
(basefile, sanitized))
else:
sanitized = basefile
return sanitized
def sanitize_identifier(self, identifier):
return prop_sanitize_identifier(identifier)
    # FixedLayoutSource.downloaded_to_intermediate will always convert
    # things to pdf, even html files. But if we only have html
    # (i.e. plaintext), we should work with that directly.
def downloaded_to_intermediate(self, basefile, attachment=None):
downloaded_path = self.store.downloaded_path(basefile, attachment=attachment)
if downloaded_path.endswith(".html"):
return self._extract_text(basefile)
else:
return super(PropTrips, self).downloaded_to_intermediate(basefile, attachment)
def extract_head(self, fp, basefile):
# get metadata from plaintext html even if we have doc/pdf,
# since plaintext is easiest to extract basic metadata from
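        # the plaintext rendering separates this metadata header from
        # the body with a run of 64 dashes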
txt = self._extract_text_inner(basefile)[:1000]
return txt.split("-"*64)[0]
def extract_metadata(self, rawheader, basefile):
d = self.metadata_from_basefile(basefile)
lines = [x.strip() for x in rawheader.split("\n\n") if x.strip()]
d["dcterms:identifier"] = "Prop. " + lines[0].split('\xb7')[1].strip()
d["dcterms:title"] = lines[1].strip()
for p in lines[2:]:
if p.startswith("Ansvarig myndighet: "):
d["rpubl:departement"] = p.split(": ", 1)[1]
elif p.startswith("Dokument: "):
pass
else:
self.log.warning("%s: Unknown header %s" % p)
return d
def sanitize_metadata(self, attribs, basefile):
attribs = super(PropTrips, self).sanitize_metadata(attribs, basefile)
if ('dcterms:title' in attribs and
'dcterms:identifier' in attribs and
attribs['dcterms:title'].endswith(attribs['dcterms:identifier'])):
x = attribs['dcterms:title'][:-len(attribs['dcterms:identifier'])]
attribs['dcterms:title'] = util.normalize_space(x)
return attribs
def extract_body(self, fp, basefile):
if util.name_from_fp(fp).endswith((".txt", ".txt.bz2")):
bodystring = fp.read()
if isinstance(bodystring, bytes):
# fp is opened in bytestream mode
bodystring = bodystring.decode("utf-8")
return TextReader(string=bodystring)
else:
reader = super(PropTrips, self).extract_body(fp, basefile)
pdffile = self.store.downloaded_path(basefile, attachment="index.pdf")
for page in reader:
page.src = pdffile
return reader
def sanitize_body(self, rawbody):
if isinstance(rawbody, TextReader):
return rawbody
else:
return super(PropTrips, self).sanitize_body(rawbody)
def get_parser(self, basefile, sanitized, initialstate=None, startpage=None, pagecount=None, parseconfig="default"):
if isinstance(sanitized, TextReader):
return self.textparser
else:
return super(PropTrips, self).get_parser(basefile, sanitized, initialstate, startpage, pagecount, parseconfig=parseconfig)
def tokenize(self, reader):
if isinstance(reader, TextReader):
return reader.getiterator(reader.readparagraph)
else:
return super(PropTrips, self).tokenize(reader)
class PropRiksdagen(Riksdagen):
alias = "propriksdagen"
rdf_type = RPUBL.Proposition
document_type = Riksdagen.PROPOSITION
def sanitize_identifier(self, identifier):
return prop_sanitize_identifier(identifier)
class PropKBStore(SwedishLegalStore):
downloaded_suffixes = [".pdf", ".xml"]
class PropKB(Offtryck, PDFDocumentRepository):
alias = "propkb"
storage_policy = "dir"
start_url = "https://riksdagstryck.kb.se/tvakammarriksdagen.html"
rdf_type = RPUBL.Proposition
basefile_regex = "prop_(?P<year>\d{4})(?P<type>_urtima|_höst|_a|_b|)__+(?P<no>\d+)(?:_(?P<part>\d+)|)"
document_type = PROPOSITION = True
SOU = DS = KOMMITTEDIREKTIV = False
documentstore_class = PropKBStore
@classmethod
def get_default_options(cls):
opts = super(PropKB, cls).get_default_options()
opts['ocr'] = False
return opts
def download_get_first_page(self):
# if we have already successfully downloaded everything, there
# is no need to even make a single network request (and we'd
# have to do at least 100 otherwise) since no new docs will
# ever be published (normally -- and if they are, just set
# config.refresh)
if (not self.config.refresh and
'lastdownload' in self.config and
self.config.lastdownload):
class DummyResp(object):
def raise_for_status(self):
pass
text = "<h1>no data</h1>"
return DummyResp()
else:
return super(PropKB, self).download_get_first_page()
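    # map the type fragment of the KB filename to the letter used in our
    # basefile ordinal: ordinary propositions get no prefix, the "b"
    # series gets "b", and urtima (extraordinary session) propositions
    # get "u"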
proptype = {"": "",
"_a": "", # 1914, 1958
"_höst": "",
"_b": "b", # also 1914, 1958
"_urtima": "u"}
@decorators.downloadmax
def download_get_basefiles(self, source):
yielded = set()
if self.download_reverseorder:
source = reversed(list(source))
for (element, attribute, link, pos) in source:
if not element.text_content():
continue
if "proposition" in element.text_content():
resp = self.session.get(link)
resp.raise_for_status()
tree = lxml.html.document_fromstring(resp.text)
tree.make_links_absolute(link, resolve_base_href=True)
for (subelement, subattribute, sublink, subpos) in tree.iterlinks():
if not subelement.text:
continue
m = re.match(self.basefile_regex, subelement.text)
if m:
basefile = "%s:%s%s" % (m.group("year"), self.proptype[m.group("type")], m.group("no"))
exists = os.path.exists(self.store.downloaded_path(basefile))
if exists and not self.config.refresh:
continue
part = m.group("part")
if (basefile,part) in yielded:
continue
if self.get_parse_options(basefile) == "skip":
continue
if part and int(part) > 1 and self.get_parse_options(basefile) != "metadataonly":
                            # Download attachments ourselves -- not
                            # really what download_get_basefiles should
                            # do, but hey....
filename = self.store.downloaded_path(basefile, attachment=part+".pdf")
self.download_if_needed(sublink, basefile, archive=self.download_archive, filename=filename)
else:
yield basefile, sublink
yielded.add((basefile,part))
def metadata_from_basefile(self, basefile):
attrib = super(PropKB, self).metadata_from_basefile(basefile)
year, ordinal = basefile.split(":")
attrib["rpubl:arsutgava"] = year
attrib["rpubl:lopnummer"] = ordinal
return attrib
def download_single(self, basefile, url=None):
if not url:
entry = DocumentEntry(self.store.documententry_path(basefile))
url = entry.orig_url
xml_downloaded_path = self.store.downloaded_path(basefile).replace(".pdf", ".xml")
if self.get_parse_options(basefile) == "metadataonly":
# in these cases, to save space, get
# the smaller XML OCR data, not the
# actual scanned images-in-PDF
url = url.replace(".pdf", ".xml").replace("pdf/web", "xml")
            # make store.downloaded_path return .xml suffixes (and set
            # the timestamp to the beginning of the epoch so that the
            # resulting if-modified-since header doesn't contain the
            # current date/time)
if not os.path.exists(xml_downloaded_path):
util.writefile(xml_downloaded_path, "")
os.utime(xml_downloaded_path, (0,0))
else:
            # if parse options have changed from metadataonly to
            # default, there will be an xml file lying around which will
            # make downloaded_path return its name. Remove it so that
            # we don't end up with pdf files that have a .xml
            # extension.
if os.path.exists(xml_downloaded_path):
os.unlink(xml_downloaded_path)
return super(PropKB, self).download_single(basefile, url)
def download_is_different(self, existing, new):
return not filecmp.cmp(new, existing, shallow=False)
# @lazyread
def downloaded_to_intermediate(self, basefile, attachment=None):
downloaded_path = self.store.downloaded_path(basefile, attachment=attachment)
if downloaded_path.endswith(".xml"):
return open(downloaded_path)
else:
intermediate_path = self.store.intermediate_path(basefile)
return self.convert_pdf(downloaded_path, intermediate_path)
def convert_pdf(self, downloaded_path, intermediate_path):
intermediate_dir = os.path.dirname(intermediate_path)
keep_xml = "bz2" if self.config.compress == "bz2" else True
reader = StreamingPDFReader()
kwargs = {'filename': downloaded_path,
'workdir': intermediate_dir,
'images': self.config.pdfimages,
'keep_xml': keep_xml}
if self.config.ocr:
kwargs['ocr_lang'] = 'swe'
return reader.convert(**kwargs)
def extract_head(self, fp, basefile):
if self.get_parse_options(basefile) == "metadataonly":
tree = etree.parse(fp)
firstpage = tree.find("//{http://www.abbyy.com/FineReader_xml/FineReader10-schema-v1.xml}page")
return firstpage
else:
return None # "rawhead" is never used
def extract_metadata(self, rawhead, basefile):
res = self.metadata_from_basefile(basefile)
# extracting title and other metadata (dep, publication date
# etc) requires parsing of the body (and subsequent processing
# in postprocess_doc). For documents marked as metadataonly in
# options.py, the body is never parsed. Therefore, we do a
# very limited parsing of the first page here.
if self.get_parse_options(basefile) == "metadataonly":
text = util.normalize_space(etree.tostring(rawhead, method="text", encoding="utf-8").decode("utf-8"))
res.update(self.find_firstpage_metadata(text, basefile))
return res
def find_firstpage_metadata(self, firstpage, basefile):
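        """Pull dcterms:title and dcterms:issued out of the first-page text.

        Older propositions typically open with a phrase along the lines
        of "Kungl. Maj:ts proposition till riksdagen ...; gifven
        Stockholms slott den 4 januari 1924" (illustrative wording),
        which the two regexes below try to match."""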
res = {}
m = re.search("proposition till riksdagen *,? *(.*?); gif?ven",
util.normalize_space(firstpage), flags=re.I)
if not m:
self.log.warning("%s: Couldn't find title in first %s characters (first page)" %
(basefile, len(firstpage)))
else:
res["dcterms:title"] = m.groups(1)
m = re.search("gif?ven stockholms slott den (\d+ \w+ \d{4})", util.normalize_space(firstpage), flags=re.I)
if not m:
self.log.warning("%s: Couldn't find date in first %s characters (first page)" %
(basefile, len(firstpage)))
else:
try:
res["dcterms:issued"] = self.parse_swedish_date(m.group(1).lower())
except ValueError as e:
self.log.warning("%s: Couldn't parse date %s" % (basefile, m.group(1)))
return res
def extract_body(self, fp, basefile):
reader = StreamingPDFReader()
parser = "ocr" if self.config.ocr else "xml"
intermediate_suffix = ".hocr" if self.config.ocr else ".xml"
if self.config.compress:
intermediate_suffix += "." + self.config.compress
reader.read(fp, parser=parser)
for attachment in [x for x in sorted(self.store.list_attachments(basefile, "downloaded")) if x.endswith(".pdf")]:
downloaded_path = self.store.downloaded_path(basefile, attachment=attachment)
iattachment = attachment.replace(".pdf", intermediate_suffix)
intermediate_path = self.store.intermediate_path(basefile, attachment=iattachment)
if not os.path.exists(intermediate_path):
fp = self.convert_pdf(downloaded_path, intermediate_path)
else:
fp = self.store.open_intermediate(basefile, attachment=iattachment)
reader += StreamingPDFReader().read(fp)
for page in reader:
page.src = "index.pdf" # FIXME: don't hardcode the filename
return reader
def postprocess_doc(self, doc):
if self.get_parse_options(doc.basefile) == "metadataonly":
return
        # the first thing will be a Sidbrytning; continue scanning text until the next Sidbrytning
firstpage = ""
for thing in doc.body[1:]:
if isinstance(thing, Sidbrytning):
break
elif isinstance(thing, Textbox):
firstpage += util.normalize_space(str(thing)) + "\n\n"
metadata = self.find_firstpage_metadata(firstpage, doc.basefile)
if "dcterms:title" in metadata:
doc.meta.add((URIRef(doc.uri), DCTERMS.title, Literal(metadata["dcterms:title"], lang=self.lang)))
if "dcterms:issued" in metadata:
doc.meta.add((URIRef(doc.uri), DCTERMS.issued, Literal(metadata["dcterms:issued"])))
# inherit list_basefiles_for from CompositeStore, basefile_to_pathfrag
# from SwedishLegalStore
class PropositionerStore(CompositeStore, SwedishLegalStore):
pass
class Propositioner(CompositeRepository, FixedLayoutSource):
subrepos = PropRegeringen, PropTrips, PropRiksdagen, PropKB
alias = "prop"
xslt_template = "xsl/forarbete.xsl"
storage_policy = "dir"
rdf_type = RPUBL.Proposition
documentstore_class = PropositionerStore
sparql_annotations = "sparql/describe-with-subdocs.rq"
sparql_expect_results = False
# NB: The same logic as in
# ferenda.sources.legal.se.{Regeringen,Riksdagen}.metadata_from_basefile
def metadata_from_basefile(self, basefile):
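        # e.g. basefile "1997/98:44" -> rpubl:arsutgava "1997/98",
        # rpubl:lopnummer "44"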
a = super(Propositioner, self).metadata_from_basefile(basefile)
a["rpubl:arsutgava"], a["rpubl:lopnummer"] = basefile.split(":", 1)
return a
def facets(self):
return super(Propositioner, self).facets() + [Facet(DCTERMS.title,
toplevel_only=False)]
def tabs(self):
if self.config.tabs:
return [('Propositioner', self.dataset_uri())]
else:
return []
    # For a certain repo, downloaded_path might return *.wpd (good) or
    # *.html (bad, because unformatted plaintext). If it returns a bad
    # format, we should continue with other repos that might have
    # *.pdf. HOWEVER, if no other repo has it in any format, we'll
    # have to accept the repo that has it as *.html.
    #
    # NOTE: This implementation does not make use of the
    # self.store.basefiles[c] cache, since that only keeps track of
    # which repos have which basefiles, not the format/quality of the
    # source.
def get_preferred_instances(self, basefile):
backups = []
for c in self.subrepos:
inst = self.get_instance(c)
source_candidate = inst.store.downloaded_path(basefile)
if os.path.exists(source_candidate):
if c.alias != "propregeringen" and source_candidate.endswith(".html"):
backups.append(inst)
else:
yield(inst)
for inst in backups:
yield(inst)