# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
print_function, unicode_literals)
from builtins import *
from future import standard_library
standard_library.install_aliases()
import re
import os
from datetime import datetime
from collections import OrderedDict, Counter
import codecs
from urllib.parse import urljoin
import json
import tempfile
import filecmp
from bs4 import BeautifulSoup
from lxml import etree
import lxml.html
import requests
from layeredconfig import LayeredConfig
from cached_property import cached_property
from rdflib import Literal, URIRef
from rdflib.namespace import DCTERMS
from ferenda import util, decorators
from ferenda.elements import Preformatted, Body
from ferenda import CompositeRepository, CompositeStore
from ferenda import TextReader, PDFAnalyzer
from ferenda import DocumentEntry, Facet, PDFDocumentRepository
from ferenda.pdfreader import StreamingPDFReader, Textbox
from . import (Trips, NoMoreLinks, Regeringen, Riksdagen,
SwedishLegalSource, SwedishLegalStore, RPUBL, Offtryck)
from .fixedlayoutsource import FixedLayoutStore, FixedLayoutSource
from .swedishlegalsource import lazyread
from .elements import Sidbrytning
def prop_sanitize_identifier(identifier):
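    """Normalize a proposition identifier to the canonical "Prop. 1997/98:44"
    form: fix casing and a missing "Prop. " prefix, add missing periods and
    spaces, replace non-breaking spaces, and shorten "2009/2010:87"-style
    year spans to "2009/10:87". Returns the result as an rdflib Literal
    (empty identifiers are returned untouched); raises ValueError if the
    result still looks irregular."""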
if not identifier:
        return identifier  # allow infer_identifier to do its magic later
if identifier.startswith("prop"):
identifier = util.ucfirst(identifier)
if identifier.startswith("PROP"):
identifier = identifier.replace("PROP", "Prop")
if identifier.startswith("Prop "):
identifier = identifier.replace("Prop ", "Prop. ")
if re.match("Prop\.\d{4}", identifier): # missing space
identifier = identifier.replace("Prop.", "Prop. ")
if "\xa0" in identifier: # Non-breakable space
identifier = identifier.replace("\xa0", " ")
if not identifier.startswith("Prop. "):
identifier = "Prop. " + identifier
# identify and correct the not-uncommon "2009/2010:87" pattern (should be 2009/10:87)
m = re.search(r"(\d{4})/(\d{4}):(\d+)$", identifier)
if m and m.group(2) != "2000" and int(m.group(1)) == int(m.group(2)) - 1:
identifier = identifier.replace(m.group(2), m.group(2)[-2:])
if not re.match(r"^Prop\. (19|20)\d{2}(|/\d{2}|/2000):(|B ?|U ?)[1-9]\d{0,2}$", identifier):
raise ValueError("Irregular identifier %s" % identifier)
return Literal(identifier)
class PropAnalyzer(PDFAnalyzer):
    # NOTE: The cutoff used to be 0.5%, but it turns out that h2's in
    # particular can be quite rare, occurring maybe two times in an
    # entire document.
style_significance_threshold = 0.001
@cached_property
def documents(self):
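        """Segment the PDF into logical documents.

        Returns a list of [startpage, pagecount, tag] entries, where tag
        is one of 'frontmatter', 'main', 'appendix' or 'eudok'."""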
def boxmatch(page, textpattern, bb=None):
if bb is None:
bb = page.boundingbox(bottom=page.height / 5)
for box in bb:
m = re.match(textpattern, str(box))
if m:
return m.group(1)
return None
documents = []
mainstyles = Counter()
pagedims = {'pagewidth': util.TopCounter(),
'pageheight': util.TopCounter()}
currentappendix = None
for pageidx, page in enumerate(self.pdf):
styles = self.count_styles(pageidx, 1)
# find the most dominant style on the page. If it uses the
# EU font, it's a separate section.
if styles and styles.most_common(1)[0][0][0].startswith("EUAlbertina"):
currentdoc = 'eudok'
                currentappendix = boxmatch(page, r"Bilaga (\d)\s*$")
else:
                # if there is a text box matching "Bilaga \d" in the top
                # margin, the bilagenummer is new, and the dominant
                # style (family) is different from any of the top 3
                # current dominant styles:
                #
                # NOTE that normally we want to treat appendices as
                # part of the regular text (so that
                # offtryck_parser.is_appendix et al can do their
                # thing). This heuristic should only catch appendices
                # that are very different.
                appendix = boxmatch(page, r"Bilaga (\d)\s*$")
if (appendix and
appendix != currentappendix and
styles.most_common(1) and
styles.most_common(1)[0][0][0] not in [x[0][0] for x in mainstyles.most_common(3)]):
currentdoc = 'appendix'
elif ".hocr." in self.pdf.filename:
# scanned sources have fluctuating page sizes,
                    # plus it's not possible to identify appendices
# by differing page dimensions
currentdoc = "main"
elif pageidx == 0 and boxmatch(page, "(REGERINGENS PROPOSITION)", page.boundingbox(top=page.height * 0.8)):
currentdoc = "frontmatter"
else:
if (pagedims['pageheight'] and
(abs(pagedims['pageheight'].top() - page.height) > 1 or
abs(pagedims['pagewidth'].top() - page.width) > 1)):
# if the page dimensions suddenly change,
# that's a dead giveaway that some external
# appendix has been lifted right into the PDF
#
# But in some cases dimension change does NOT
# mean external appendix. In Prop 2015/16:195,
# which is split in 4 pdfs (2 logical volumes)
# it's just an artifact due to the 2nd pdf
# being properly cropped while the 1st
# isn't. In prop 2008/09:140, which
# uncharacteristically includes frontmatter, a
# dimension change signals the change from
# frontmatter to main
if currentdoc == "frontmatter":
currentdoc = "main"
else:
currentdoc = 'appendix'
else:
currentdoc = 'main'
currentappendix = appendix
if currentdoc == "main":
mainstyles += styles
pagedims['pagewidth'][page.width] += 1
pagedims['pageheight'][page.height] += 1
# update the current document segment tuple or start a new one
if documents and documents[-1][2] == currentdoc:
documents[-1][1] += 1
else:
documents.append([pageidx, 1, currentdoc])
return documents
def guess_pagenumber_select(self, candidates, probable_pagenumber):
if self.scanned_source:
            # try to avoid assuming that smudges and other crap equal a
            # lower-case L or other things that might be interpreted
            # as roman numerals
if util.is_roman(candidates[0]) and str(probable_pagenumber) == "1":
return 1 # Do not interpret a single 'l' as roman 50
# -- it's probably a badly OCR:ed '
else:
# be a little more conservative with what a good guess
# is compared to PDFAnalyzer.guess_pagenumber_select:
# only accept the smallest candidate larger-or-equal
# to the probable_pagenumber -- but not if it's a
# too-large gap. Also, assume no roman numerals
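            # (illustrative example: with candidates [3, 14, 120] and
            # probable_pagenumber 13, we return 14 -- 3 is too small and
            # 120 exceeds 13 * 2, so it counts as a too-large gap)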
try:
return next(c for c in sorted(candidates) if c >= probable_pagenumber and c <= probable_pagenumber * 2)
except StopIteration: # no suitable candidate
return None
# otherwise fall back to superclass implementation
return super(PropAnalyzer, self).guess_pagenumber_select(candidates, probable_pagenumber)
def guess_pagenumber_boxes(self, page):
"""Return a suitable number of textboxes to scan for a possible page number. """
if self.scanned_source:
# For scanned source, the default strategy works so-so
# (many OCR errors may result in misinterpreting things as
# pagenumbers) so we also take into account the text box
# property. Only select thin boxes (less than 1/50th of
# the page width) -- page numbers should stand by
# themselves and naturally be pretty thin
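            # (e.g. on a 595 pt wide A4 page this keeps only boxes
            # narrower than roughly 12 pt -- illustrative figures)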
return [b for b in list(reversed(page))[:5] + list(page)[:5] if b.width < page.width/50]
else:
return super(PropAnalyzer, self).guess_pagenumber_boxes(page)
def metrics(self, metricspath=None, plotpath=None, startpage=0,
pagecount=None, force=False):
docsegments = self.documents
if len(docsegments) == 1:
return super(PropAnalyzer, self).metrics(metricspath,
plotpath,
startpage,
pagecount, force)
else:
r = []
exclude = []
mainidx = None
for idx, (startpage, pagecount, tag) in enumerate(docsegments):
                r.append(super(PropAnalyzer, self).metrics(startpage=startpage,
                                                           pagecount=pagecount))
if tag != 'main':
exclude.extend(list(range(startpage, startpage+pagecount)))
elif mainidx is None:
mainidx = idx
r[mainidx]['excludedpages'] = exclude
# since we don't pass metricspath to super().metrics, that
# func does not create a metrics.json cache file. So we
# generate that now (using the same data as we return)
util.ensure_dir(metricspath)
with open(metricspath, "w") as fp:
s = json.dumps(r[mainidx], indent=4, separators=(', ', ': '), sort_keys=True)
fp.write(s)
return r[mainidx]
def count_styles(self, startpage, pagecount):
# we should avoid counting the styles on the front page, as
# that page uses a title font, not used anywhere else in the
# document, which is then mistaken for the h1 font.
if not startpage:
startpage = 1
return super(PropAnalyzer, self).count_styles(startpage, pagecount)
class PropRegeringen(Regeringen):
alias = "propregeringen"
re_basefile_strict = re.compile(r'Prop. (\d{4}/\d{2,4}:\d+)')
re_basefile_lax = re.compile(
r'(?:Prop\.?|) ?(\d{4}/\d{2,4}:\d+)', re.IGNORECASE)
    re_urlbasefile_strict = re.compile(r"proposition/\d+/\d+/[a-z]*\.?-?(\d{6})(\d+)-?/$")
    re_urlbasefile_lax = re.compile(r"proposition/\d+/\d+/.*?(\d{4}_?\d{2})[_-]?(\d+)")
rdf_type = RPUBL.Proposition
document_type = Regeringen.PROPOSITION
# sparql_annotations = "sparql/prop-annotations.rq"
def attribs_from_url(self, url):
attribs = super(PropRegeringen, self).attribs_from_url(url)
# correct the not uncommon "2007/20:08123" -> "2007/2008:123" issue
total = attribs["rpubl:arsutgava"] + attribs["rpubl:lopnummer"]
if total.isdigit() and int(total[:4]) - int(total[4:8]) == - 1:
# convert to "2007/2008:123" and let santize_basefile make
# canonical (and warn). This way we don't need to
# specialcase "1999/2000:123"
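            # (worked example: arsutgava "200720" + lopnummer "08123"
            # gives total "20072008123"; the first eight digits become
            # arsutgava "2007/2008" and the remainder "123" becomes
            # lopnummer)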
attribs["rpubl:arsutgava"] = total[:8]
attribs["rpubl:lopnummer"] = total[8:]
y = attribs["rpubl:arsutgava"]
if "/" not in y:
attribs['rpubl:arsutgava'] = "%s/%s" % (y[:4], y[4:])
return attribs
def sanitize_identifier(self, identifier):
return prop_sanitize_identifier(identifier)
class PropTripsStore(FixedLayoutStore):
    # 1993/94 and 1994/95 have only plaintext (wrapped in .html)
    # 1995/96 to 2006/07 have plaintext + doc
    # 2007/08 onwards has plaintext, doc and pdf
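    # Map each candidate suffix to the file signature (magic bytes) used
    # to recognize that format, e.g. b'%PDF' for PDF and
    # b'\xd0\xcf\x11\xe0' for legacy OLE/Word files.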
doctypes = OrderedDict([
(".pdf", b'%PDF'),
(".doc", b'\xd0\xcf\x11\xe0'),
(".docx", b'PK\x03\x04'),
(".wpd", b'\xffWPC'),
(".html", b'<!DO'),
])
def intermediate_path(self, basefile, version=None, attachment=None, suffix=None):
# we need to select a suitable intermediate suffix based upon
# the downloaded suffix (pdf->xml, html->txt)
if self.downloaded_path(basefile).endswith(".html"):
from ferenda.documentstore import _compressed_suffix
return self.path(basefile, "intermediate", ".txt" + _compressed_suffix(self.compression))
else:
return super(PropTripsStore, self).intermediate_path(basefile, version, attachment, suffix)
# We derive from Trips for downloading, from FixedLayoutSource for
# downloaded_to_intermediate, extract_{head,metadata,body}, and from
# Offtryck for almost everything else. FIXME: This is not manageable.
class PropTrips(Trips, Offtryck, FixedLayoutSource):
alias = "proptrips"
ar = ""
start_url = "http://rkrattsbaser.gov.se/prop/adv?dok=P&sort=asc&ar={c.lastyear}"
document_url_template = "http://rkrattsbaser.gov.se/prop?ar=%(year)s&dok=P&dokid=%(ordinal)s"
basefile_regex = "(?P<basefile>\d+/\d+:\d+)$"
downloaded_suffix = ".html"
rdf_type = RPUBL.Proposition
KOMMITTEDIREKTIV = SOU = DS = None
PROPOSITION = "prop"
document_type = PROPOSITION
storage_policy = "dir"
documentstore_class = PropTripsStore
urispace_segment = "prop"
@classmethod
def get_default_options(cls):
opts = super(PropTrips, cls).get_default_options()
opts['lastyear'] = ""
return opts
# don't use @recordlastdownload -- download_get_basefiles_page
# should set self.config.lastyear instead
def download(self, basefile=None):
if self.config.ipbasedurls:
self._make_ipbasedurls()
urlmap_path = self.store.path("urls", "downloaded", ".map",
storage_policy="file")
self.urlmap = {}
if os.path.exists(urlmap_path):
with codecs.open(urlmap_path, encoding="utf-8") as fp:
for line in fp:
url, attachment = line.split("\t")
self.urlmap[url] = attachment.strip()
if basefile:
return super(PropTrips, self).download(basefile)
try:
now = datetime.now()
if ('lastyear' in self.config and
self.config.lastyear and
not self.config.refresh):
maxyear = "%s/%s" % (now.year, (now.year + 1) % 100)
while self.config.lastyear != maxyear:
r = self.inner_download()
else:
self.config.lastyear = ''
r = self.inner_download()
self.config.lastyear = "%s/%s" % (now.year - 1,
(now.year % 100))
LayeredConfig.write(self.config) # assume we have data to write
return r
finally:
with codecs.open(urlmap_path, "w", encoding="utf-8") as fp:
for url, attachment in self.urlmap.items():
fp.write("%s\t%s\n" % (url, attachment))
def inner_download(self):
refresh = self.config.refresh
updated = False
for basefile, url in self.download_get_basefiles(None):
if url in self.urlmap:
attachment = self.urlmap[url]
else:
attachment = self.sniff_attachment(url)
if attachment:
self.urlmap[url] = attachment
attachment += ".html"
else:
self.urlmap[url] = ''
attachment = None # instead of the empty string
if (refresh or
(not os.path.exists(self.store.downloaded_path(basefile, attachment=attachment)))):
ret = self.download_single(basefile, url)
updated = updated or ret
return updated
def sniff_attachment(self, url):
r = requests.get(url, stream=True)
head = r.raw.read(8000)
soup = BeautifulSoup(head, "lxml")
return self.find_attachment(soup)
def find_attachment(self, soup):
results = soup.find("div", "search-results-content")
dokid = results.find("span", string="Dokument:")
if not dokid:
return None
dokid = dokid.next_sibling.strip().split(" ")[-1]
if "/" in dokid:
dokid, attachment = dokid.split("/")
else:
attachment = None
return attachment
def _next_year(self, year):
# "1992/93" -> "1993/94"
# "1998/99" -> "1999/00"
assert len(year) == 7, "invalid year specifier %s" % year
y1, y2 = int(year[:4]) + 1, int(year[-2:]) + 1
return "%04d/%02d" % (int(y1), int(y2) % 100)
def _prev_year(self, year):
# "1993/94" -> "1992/93"
# "1999/00" -> "1998/99"
assert len(year) == 7, "invalid year specifier %s" % year
y1, y2 = int(year[:4]) - 1, int(year[-2:]) - 1
return "%04d/%02d" % (int(y1), int(y2) % 100)
def remote_url(self, basefile):
year, ordinal = basefile.split(":")
return self.document_url_template % locals()
def download_get_basefiles_page(self, soup):
nextpage = None
for hit in soup.findAll("div", "search-hit-info-num"):
basefile = hit.text.split(": ", 1)[1].strip()
m = re.search(self.basefile_regex, basefile)
if m:
basefile = m.group()
else:
self.log.warning("Couldn't find a basefile in this label: %r" % basefile)
continue
docurl = urljoin(self.start_url, hit.parent.a["href"])
yield(self.sanitize_basefile(basefile), docurl)
nextpage = soup.find("div", "search-opt-next").a
if nextpage:
nextpage = urljoin(self.start_url,
nextpage.get("href"))
else:
if self.config.lastyear:
b = self._next_year(self.config.lastyear)
else:
now = datetime.now()
b = "%s/%s" % (now.year - 1, (now.year) % 100)
self.log.info("Advancing year from %s to %s" % (self.config.lastyear, b))
self.config.lastyear = b
raise NoMoreLinks(nextpage)
def download_single(self, basefile, url=None):
if url is None:
url = self.remote_url(basefile)
if not url: # remote_url failed
return
updated = created = False
checked = True
mainattachment = None
if url in self.urlmap:
attachment = self.urlmap[url]
else:
attachment = self.sniff_attachment(url)
if attachment:
self.urlmap[url] = attachment
attachment += ".html"
else:
self.urlmap[url] = ''
attachment = "index.html"
downloaded_path = self.store.downloaded_path(basefile,
attachment=attachment)
created = not os.path.exists(downloaded_path)
if self.download_if_needed(url, basefile, filename=downloaded_path):
text = util.readfile(downloaded_path)
if "<div>Inga tr\xe4ffar</div>" in text:
self.log.warning("%s: Could not find this prop at %s, might be a bug" % (basefile, url))
util.robust_remove(downloaded_path)
return False
if created:
self.log.info("%s: download OK from %s" % (basefile, url))
else:
self.log.info(
"%s: download OK (new version) from %s" % (basefile, url))
updated = True
else:
self.log.debug("%s: exists and is unchanged" % basefile)
text = util.readfile(downloaded_path)
soup = BeautifulSoup(text, "lxml")
del text
attachment = self.find_attachment(soup)
extraurls = []
results = soup.find("div", "search-results-content")
a = results.find("a", string="Hämta Pdf")
if a:
extraurls.append(a.get("href"))
a = results.find("a", string="Hämta Doc")
if a:
extraurls.append(a.get("href"))
# parse downloaded html/text page and find out extraurls
for url in extraurls:
if url.endswith('get=doc'):
                # NOTE: We cannot be sure that this is
                # actually a Word (CDF) file. For older files
                # it might be a WordPerfect file (.wpd) or an
                # RTF file, for newer it might be a .docx. We
                # cannot be sure until we've downloaded it.
                # So we quickly read the first 4 bytes
r = requests.get(url, stream=True)
sig = r.raw.read(4)
# r.raw.close()
#bodyidx = head.index("\n\n")
#sig = head[bodyidx:bodyidx+4]
if sig == b'\xffWPC':
doctype = ".wpd"
elif sig == b'\xd0\xcf\x11\xe0':
doctype = ".doc"
elif sig == b'PK\x03\x04':
doctype = ".docx"
elif sig == b'{\\rt':
doctype = ".rtf"
else:
self.log.error(
"%s: Attached file has signature %r -- don't know what type this is" % (basefile, sig))
continue
elif url.endswith('get=pdf'):
doctype = ".pdf"
else:
self.log.warning("Unknown doc type %s" %
url.split("get=")[-1])
doctype = None
if doctype:
if attachment:
filename = self.store.downloaded_path(
basefile, attachment=attachment + doctype)
else:
                    filename = self.store.downloaded_path(
                        basefile, attachment="index" + doctype)
self.log.debug("%s: downloading attachment %s" % (basefile, filename))
self.download_if_needed(url, basefile, filename=filename)
entry = DocumentEntry(self.store.documententry_path(basefile))
now = datetime.now()
entry.orig_url = url
if created:
entry.orig_created = now
if updated:
entry.orig_updated = now
if checked:
entry.orig_checked = now
entry.save()
return updated
# Correct some invalid identifiers spotted in the wild:
# 1999/20 -> 1999/2000
# 2000/2001 -> 2000/01
# 1999/98 -> 1999/2000
def sanitize_basefile(self, basefile):
(y1, y2, idx) = re.split("[:/]", basefile)
        assert len(y1) == 4, "Basefile %s is invalid beyond sanitization" % basefile
if y1 == "1999" and y2 != "2000":
sanitized = "1999/2000:" + idx
self.log.warning("Basefile given as %s, correcting to %s" %
(basefile, sanitized))
elif (y1 != "1999" and
(len(y2) != 2 or # eg "2000/001"
               int(y1[2:]) + 1 != int(y2))):  # eg "1999/98"
sanitized = "%s/%02d:%s" % (y1, int(y1[2:]) + 1, idx)
self.log.warning("Basefile given as %s, correcting to %s" %
(basefile, sanitized))
else:
sanitized = basefile
return sanitized
def sanitize_identifier(self, identifier):
return prop_sanitize_identifier(identifier)
    # FixedLayoutSource.downloaded_to_intermediate will always convert
    # things to pdf, even html files. But if we only have html
    # (i.e. plaintext), we should work with that directly.
def downloaded_to_intermediate(self, basefile, attachment=None):
downloaded_path = self.store.downloaded_path(basefile, attachment=attachment)
if downloaded_path.endswith(".html"):
return self._extract_text(basefile)
else:
return super(PropTrips, self).downloaded_to_intermediate(basefile, attachment)
def extract_head(self, fp, basefile):
# get metadata from plaintext html even if we have doc/pdf,
# since plaintext is easiest to extract basic metadata from
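        # the plaintext rendering separates this metadata header from
        # the body with a run of 64 dashes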
txt = self._extract_text_inner(basefile)[:1000]
return txt.split("-"*64)[0]
def extract_metadata(self, rawheader, basefile):
d = self.metadata_from_basefile(basefile)
lines = [x.strip() for x in rawheader.split("\n\n") if x.strip()]
d["dcterms:identifier"] = "Prop. " + lines[0].split('\xb7')[1].strip()
d["dcterms:title"] = lines[1].strip()
for p in lines[2:]:
if p.startswith("Ansvarig myndighet: "):
d["rpubl:departement"] = p.split(": ", 1)[1]
elif p.startswith("Dokument: "):
pass
else:
self.log.warning("%s: Unknown header %s" % p)
return d
def sanitize_metadata(self, attribs, basefile):
attribs = super(PropTrips, self).sanitize_metadata(attribs, basefile)
if ('dcterms:title' in attribs and
'dcterms:identifier' in attribs and
attribs['dcterms:title'].endswith(attribs['dcterms:identifier'])):
x = attribs['dcterms:title'][:-len(attribs['dcterms:identifier'])]
attribs['dcterms:title'] = util.normalize_space(x)
return attribs
def extract_body(self, fp, basefile):
if util.name_from_fp(fp).endswith((".txt", ".txt.bz2")):
bodystring = fp.read()
if isinstance(bodystring, bytes):
# fp is opened in bytestream mode
bodystring = bodystring.decode("utf-8")
return TextReader(string=bodystring)
else:
reader = super(PropTrips, self).extract_body(fp, basefile)
pdffile = self.store.downloaded_path(basefile, attachment="index.pdf")
for page in reader:
page.src = pdffile
return reader
def sanitize_body(self, rawbody):
if isinstance(rawbody, TextReader):
return rawbody
else:
return super(PropTrips, self).sanitize_body(rawbody)
def get_parser(self, basefile, sanitized, initialstate=None, startpage=None, pagecount=None, parseconfig="default"):
if isinstance(sanitized, TextReader):
return self.textparser
else:
return super(PropTrips, self).get_parser(basefile, sanitized, initialstate, startpage, pagecount, parseconfig=parseconfig)
def tokenize(self, reader):
if isinstance(reader, TextReader):
return reader.getiterator(reader.readparagraph)
else:
return super(PropTrips, self).tokenize(reader)
class PropRiksdagen(Riksdagen):
alias = "propriksdagen"
rdf_type = RPUBL.Proposition
document_type = Riksdagen.PROPOSITION
def sanitize_identifier(self, identifier):
return prop_sanitize_identifier(identifier)
class PropKBStore(SwedishLegalStore):
downloaded_suffixes = [".pdf", ".xml"]
class PropKB(Offtryck, PDFDocumentRepository):
alias = "propkb"
storage_policy = "dir"
start_url = "https://riksdagstryck.kb.se/tvakammarriksdagen.html"
rdf_type = RPUBL.Proposition
basefile_regex = "prop_(?P<year>\d{4})(?P<type>_urtima|_höst|_a|_b|)__+(?P<no>\d+)(?:_(?P<part>\d+)|)"
document_type = PROPOSITION = True
SOU = DS = KOMMITTEDIREKTIV = False
documentstore_class = PropKBStore
@classmethod
def get_default_options(cls):
opts = super(PropKB, cls).get_default_options()
opts['ocr'] = False
return opts
def download_get_first_page(self):
# if we have already successfully downloaded everything, there
# is no need to even make a single network request (and we'd
# have to do at least 100 otherwise) since no new docs will
# ever be published (normally -- and if they are, just set
# config.refresh)
if (not self.config.refresh and
'lastdownload' in self.config and
self.config.lastdownload):
class DummyResp(object):
def raise_for_status(self):
pass
text = "<h1>no data</h1>"
return DummyResp()
else:
return super(PropKB, self).download_get_first_page()
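    # map the type fragment of the KB filename to the letter used in our
    # basefile ordinal: ordinary propositions get no prefix, the "b"
    # series gets "b", and urtima (extraordinary session) propositions
    # get "u"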
proptype = {"": "",
"_a": "", # 1914, 1958
"_höst": "",
"_b": "b", # also 1914, 1958
"_urtima": "u"}
@decorators.downloadmax
def download_get_basefiles(self, source):
yielded = set()
if self.download_reverseorder:
source = reversed(list(source))
for (element, attribute, link, pos) in source:
if not element.text_content():
continue
if "proposition" in element.text_content():
resp = self.session.get(link)
resp.raise_for_status()
tree = lxml.html.document_fromstring(resp.text)
tree.make_links_absolute(link, resolve_base_href=True)
for (subelement, subattribute, sublink, subpos) in tree.iterlinks():
if not subelement.text:
continue
m = re.match(self.basefile_regex, subelement.text)
if m:
basefile = "%s:%s%s" % (m.group("year"), self.proptype[m.group("type")], m.group("no"))
exists = os.path.exists(self.store.downloaded_path(basefile))
if exists and not self.config.refresh:
continue
part = m.group("part")
if (basefile,part) in yielded:
continue
if self.get_parse_options(basefile) == "skip":
continue
if part and int(part) > 1 and self.get_parse_options(basefile) != "metadataonly":
                            # Download attachments ourselves -- not
                            # really what download_get_basefiles should
                            # do, but hey....
filename = self.store.downloaded_path(basefile, attachment=part+".pdf")
self.download_if_needed(sublink, basefile, archive=self.download_archive, filename=filename)
else:
yield basefile, sublink
yielded.add((basefile,part))
def metadata_from_basefile(self, basefile):
attrib = super(PropKB, self).metadata_from_basefile(basefile)
year, ordinal = basefile.split(":")
attrib["rpubl:arsutgava"] = year
attrib["rpubl:lopnummer"] = ordinal
return attrib
def download_single(self, basefile, url=None):
if not url:
entry = DocumentEntry(self.store.documententry_path(basefile))
url = entry.orig_url
xml_downloaded_path = self.store.downloaded_path(basefile).replace(".pdf", ".xml")
if self.get_parse_options(basefile) == "metadataonly":
# in these cases, to save space, get
# the smaller XML OCR data, not the
# actual scanned images-in-PDF
url = url.replace(".pdf", ".xml").replace("pdf/web", "xml")
            # make store.downloaded_path return .xml suffixes (and set
            # the timestamp to the beginning of the epoch so that the
            # resulting if-modified-since header doesn't contain the
            # current date/time)
if not os.path.exists(xml_downloaded_path):
util.writefile(xml_downloaded_path, "")
os.utime(xml_downloaded_path, (0,0))
else:
            # if parse options have changed from metadataonly to
            # default, there will be an xml file lying around which will
            # make downloaded_path return its name. Remove it so that
            # we don't end up with pdf files that have a .xml
            # extension.
if os.path.exists(xml_downloaded_path):
os.unlink(xml_downloaded_path)
return super(PropKB, self).download_single(basefile, url)
def download_is_different(self, existing, new):
return not filecmp.cmp(new, existing, shallow=False)
# @lazyread
def downloaded_to_intermediate(self, basefile, attachment=None):
downloaded_path = self.store.downloaded_path(basefile, attachment=attachment)
if downloaded_path.endswith(".xml"):
return open(downloaded_path)
else:
intermediate_path = self.store.intermediate_path(basefile)
return self.convert_pdf(downloaded_path, intermediate_path)
def convert_pdf(self, downloaded_path, intermediate_path):
intermediate_dir = os.path.dirname(intermediate_path)
keep_xml = "bz2" if self.config.compress == "bz2" else True
reader = StreamingPDFReader()
kwargs = {'filename': downloaded_path,
'workdir': intermediate_dir,
'images': self.config.pdfimages,
'keep_xml': keep_xml}
if self.config.ocr:
kwargs['ocr_lang'] = 'swe'
return reader.convert(**kwargs)
def extract_head(self, fp, basefile):
if self.get_parse_options(basefile) == "metadataonly":
tree = etree.parse(fp)
firstpage = tree.find("//{http://www.abbyy.com/FineReader_xml/FineReader10-schema-v1.xml}page")
return firstpage
else:
return None # "rawhead" is never used
def extract_metadata(self, rawhead, basefile):
res = self.metadata_from_basefile(basefile)
# extracting title and other metadata (dep, publication date
# etc) requires parsing of the body (and subsequent processing
# in postprocess_doc). For documents marked as metadataonly in
# options.py, the body is never parsed. Therefore, we do a
# very limited parsing of the first page here.
if self.get_parse_options(basefile) == "metadataonly":
text = util.normalize_space(etree.tostring(rawhead, method="text", encoding="utf-8").decode("utf-8"))
res.update(self.find_firstpage_metadata(text, basefile))
return res
def find_firstpage_metadata(self, firstpage, basefile):
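        """Pull dcterms:title and dcterms:issued out of the first-page text.

        Older propositions typically open with a phrase along the lines
        of "Kungl. Maj:ts proposition till riksdagen ...; gifven
        Stockholms slott den 4 januari 1924" (illustrative wording),
        which the two regexes below try to match."""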
res = {}
m = re.search("proposition till riksdagen *,? *(.*?); gif?ven",
util.normalize_space(firstpage), flags=re.I)
if not m:
self.log.warning("%s: Couldn't find title in first %s characters (first page)" %
(basefile, len(firstpage)))
else:
res["dcterms:title"] = m.groups(1)
m = re.search("gif?ven stockholms slott den (\d+ \w+ \d{4})", util.normalize_space(firstpage), flags=re.I)
if not m:
self.log.warning("%s: Couldn't find date in first %s characters (first page)" %
(basefile, len(firstpage)))
else:
try:
res["dcterms:issued"] = self.parse_swedish_date(m.group(1).lower())
except ValueError as e:
self.log.warning("%s: Couldn't parse date %s" % (basefile, m.group(1)))
return res
def extract_body(self, fp, basefile):
reader = StreamingPDFReader()
parser = "ocr" if self.config.ocr else "xml"
intermediate_suffix = ".hocr" if self.config.ocr else ".xml"
if self.config.compress:
intermediate_suffix += "." + self.config.compress
reader.read(fp, parser=parser)
for attachment in [x for x in sorted(self.store.list_attachments(basefile, "downloaded")) if x.endswith(".pdf")]:
downloaded_path = self.store.downloaded_path(basefile, attachment=attachment)
iattachment = attachment.replace(".pdf", intermediate_suffix)
intermediate_path = self.store.intermediate_path(basefile, attachment=iattachment)
if not os.path.exists(intermediate_path):
fp = self.convert_pdf(downloaded_path, intermediate_path)
else:
fp = self.store.open_intermediate(basefile, attachment=iattachment)
reader += StreamingPDFReader().read(fp)
for page in reader:
page.src = "index.pdf" # FIXME: don't hardcode the filename
return reader
def postprocess_doc(self, doc):
if self.get_parse_options(doc.basefile) == "metadataonly":
return
        # the first thing will be a Sidbrytning; continue scanning text until the next Sidbrytning
firstpage = ""
for thing in doc.body[1:]:
if isinstance(thing, Sidbrytning):
break
elif isinstance(thing, Textbox):
firstpage += util.normalize_space(str(thing)) + "\n\n"
metadata = self.find_firstpage_metadata(firstpage, doc.basefile)
if "dcterms:title" in metadata:
doc.meta.add((URIRef(doc.uri), DCTERMS.title, Literal(metadata["dcterms:title"], lang=self.lang)))
if "dcterms:issued" in metadata:
doc.meta.add((URIRef(doc.uri), DCTERMS.issued, Literal(metadata["dcterms:issued"])))
# inherit list_basefiles_for from CompositeStore, basefile_to_pathfrag
# from SwedishLegalStore
class PropositionerStore(CompositeStore, SwedishLegalStore):
pass
class Propositioner(CompositeRepository, FixedLayoutSource):
subrepos = PropRegeringen, PropTrips, PropRiksdagen, PropKB
alias = "prop"
xslt_template = "xsl/forarbete.xsl"
storage_policy = "dir"
rdf_type = RPUBL.Proposition
documentstore_class = PropositionerStore
sparql_annotations = "sparql/describe-with-subdocs.rq"
sparql_expect_results = False
# NB: The same logic as in
# ferenda.sources.legal.se.{Regeringen,Riksdagen}.metadata_from_basefile
def metadata_from_basefile(self, basefile):
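        # e.g. basefile "1997/98:44" -> rpubl:arsutgava "1997/98",
        # rpubl:lopnummer "44"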
a = super(Propositioner, self).metadata_from_basefile(basefile)
a["rpubl:arsutgava"], a["rpubl:lopnummer"] = basefile.split(":", 1)
return a
def facets(self):
return super(Propositioner, self).facets() + [Facet(DCTERMS.title,
toplevel_only=False)]
def tabs(self):
if self.config.tabs:
return [('Propositioner', self.dataset_uri())]
else:
return []
    # For a certain repo, downloaded_path might return *.wpd (good) or
    # *.html (bad, because unformatted plaintext). If it returns a bad
    # format, we should continue with other repos that might have
    # *.pdf. HOWEVER, if no other repo has it in any format, we'll
    # have to accept the repo that has it as *.html.
    #
    # NOTE: This implementation does not make use of the
    # self.store.basefiles[c] cache, since that only keeps track of
    # which repos have which basefiles, not the format/quality of the
    # source.
def get_preferred_instances(self, basefile):
backups = []
for c in self.subrepos:
inst = self.get_instance(c)
source_candidate = inst.store.downloaded_path(basefile)
if os.path.exists(source_candidate):
if c.alias != "propregeringen" and source_candidate.endswith(".html"):
backups.append(inst)
else:
yield(inst)
for inst in backups:
yield(inst)