# -*- coding: utf-8 -*-
from __future__ import (absolute_import, division,
print_function, unicode_literals)
from builtins import *
"""Hanterar domslut (detaljer och referat) från Domstolsverket. Data
hämtas från DV:s (ickepublika) FTP-server, eller från lagen.nu."""
# system libraries (incl six-based renames)
from bz2 import BZ2File
from collections import defaultdict
from datetime import datetime, timedelta
from ftplib import FTP
from io import BytesIO
from time import mktime
from urllib.parse import urljoin, urlparse
import codecs
import itertools
import logging
import os
import re
import tempfile
import zipfile
# 3rdparty libs
from cached_property import cached_property
from rdflib import Namespace, URIRef, Graph, RDF, RDFS, BNode
from rdflib.namespace import DCTERMS, SKOS, FOAF
import requests
import lxml.html
from lxml import etree
from bs4 import BeautifulSoup, NavigableString
try:
# this is a optional dependency that only works on py3 and which
# is only needed when multiple processes write to a single shared
# file (generated/uri.map) over NFS
from flufl.lock import Lock
except ImportError:
Lock = None
# my libs
from ferenda import (Document, DocumentStore, Describer, WordReader, FSMParser, Facet)
from ferenda.decorators import newstate, action
from ferenda import util, errors, fulltextindex
from ferenda.sources.legal.se.legalref import LegalRef
from ferenda.elements import (Body, Paragraph, CompoundElement, OrdinalElement,
Heading, Link)
from ferenda.elements.html import Strong, Em, Div, P
from . import SwedishLegalSource, SwedishCitationParser, RPUBL
from .elements import *
PROV = Namespace(util.ns['prov'])
class DVStore(DocumentStore):
    """Customized DocumentStore.

    Basefiles for this repo (eg "HDO/B3811-03") already form usable
    path fragments, so the basefile<->pathfrag mappings are plain
    identity functions.
    """

    downloaded_suffixes = [".docx", ".doc"]

    def basefile_to_pathfrag(self, basefile):
        # the basefile is already a valid path fragment
        return basefile

    def pathfrag_to_basefile(self, pathfrag):
        # inverse of the identity mapping above
        return pathfrag
class KeywordContainsDescription(errors.FerendaException):
    """Presumably raised when a keyword field turns out to also contain
    description text; carries both parts so the caller can recover them.
    """

    def __init__(self, keywords, descriptions):
        # keep both the extracted keywords and the stray descriptions
        self.keywords = keywords
        self.descriptions = descriptions
class DuplicateReferatDoc(errors.DocumentRemovedError):
    # Presumably raised when a referat turns out to duplicate an
    # already-processed document -- TODO confirm against raise sites.
    pass
class DV(SwedishLegalSource):
"""Handles legal cases, in report form, from primarily final instance courts.
Cases are fetched from Domstolsverkets FTP server for "Vägledande
avgöranden", and are converted from doc/docx format.
"""
# repository alias, used in config sections and data directory paths
alias = "dv"
downloaded_suffix = ".zip"
# documents are either full reports (referat) or short notices (notiser)
rdf_type = (RPUBL.Rattsfallsreferat, RPUBL.Rattsfallsnotis)
documentstore_class = DVStore
# This is very similar to SwedishLegalSource.required_predicates,
# only DCTERMS.title has been changed to RPUBL.referatrubrik (and if
# our validating function grokked that rpubl:referatrubrik
# rdfs:isSubpropertyOf dcterms:title, we wouldn't need this). Also, we
# removed dcterms:issued because there is no actual way of getting
# this data (apart from like the file time stamps). On further
# thinking, we remove RPUBL.referatrubrik as it's not present (or
# required) for rpubl:Rattsfallsnotis
required_predicates = [RDF.type, DCTERMS.identifier, PROV.wasGeneratedBy]
# NOTE(review): this class attribute shadows the module-level DCTERMS
# imported from rdflib.namespace for later class-body/method lookups --
# confirm the shadowing is intended
DCTERMS = Namespace(util.ns['dcterms'])
sparql_annotations = "sparql/dv-annotations.rq"
sparql_expect_results = False
xslt_template = "xsl/dv.xsl"
@classmethod
def relate_all_setup(cls, config, *args, **kwargs):
    """Regenerate the uri.map file (mapping canonical URI paths to
    basefiles) from all parsed XHTML files, then delegate to the
    superclass setup.

    The map is written atomically (written to uri.map.new, then
    renamed) in either an Apache mod_rewrite-style or (when
    config.mapfiletype == "nginx") an nginx-style format.
    """
    # FIXME: If this was an instancemethod, we could use
    # self.store methods instead
    parsed_dir = os.path.sep.join([config.datadir, 'dv', 'parsed'])
    mapfile = os.path.sep.join(
        [config.datadir, 'dv', 'generated', 'uri.map'])
    log = logging.getLogger(cls.alias)
    # only rebuild if some parsed file is newer than the map (or on --force)
    if (not util.outfile_is_newer(util.list_dirs(parsed_dir, ".xhtml"), mapfile)) or config.force:
        # matches the canonical URI in the parsed file's <head about="...">
        re_xmlbase = re.compile('<head about="([^"]+)"')
        log.info("Creating uri.map file")
        cnt = 0
        # also remove any uri-<client>-<pid>.map files that might be laying around
        for m in util.list_dirs(os.path.dirname(mapfile), ".map"):
            if m == mapfile:
                continue
            util.robust_remove(m)
        util.robust_remove(mapfile + ".new")
        util.ensure_dir(mapfile)
        # FIXME: Not sure utf-8 is the correct codec for us -- it
        # might be iso-8859-1 (it's to be used by mod_rewrite).
        with codecs.open(mapfile + ".new", "w", encoding="utf-8") as fp:
            paths = set()  # guards against duplicate URI paths
            for f in util.list_dirs(parsed_dir, ".xhtml"):
                if not os.path.getsize(f):
                    # skip empty files
                    continue
                # get basefile from f in the simplest way: strip the
                # parsed_dir prefix and the ".xhtml" suffix
                basefile = f[len(parsed_dir) + 1:-6]
                # the @about attribute appears near the top of the
                # file, so 1K is enough
                head = codecs.open(f, encoding='utf-8').read(1024)
                m = re_xmlbase.search(head)
                if m:
                    path = urlparse(m.group(1)).path
                    if path in paths:
                        log.warning("Path %s is already in map" % path)
                        continue
                    assert path
                    assert basefile
                    if config.mapfiletype == "nginx":
                        fp.write("%s\t/dv/generated/%s.html;\n" % (path, basefile))
                    else:
                        # remove prefix "/dom/" from path
                        path = path.replace("/%s/" % cls.urispace_segment, "", 1)
                        fp.write("%s\t%s\n" % (path, basefile))
                    cnt += 1
                    paths.add(path)
                else:
                    log.warning(
                        "%s: Could not find valid head[@about] in %s" %
                        (basefile, f))
        # atomic replace of the old map
        util.robust_rename(mapfile + ".new", mapfile)
        log.info("uri.map created, %s entries" % cnt)
    else:
        log.debug("Not regenerating uri.map")
        pass
    return super(DV, cls).relate_all_setup(config, *args, **kwargs)
# def relate(self, basefile, otherrepos): pass
@classmethod
def get_default_options(cls):
    """Return default configuration options, extended with DV-specific
    FTP credentials and uri.map output format."""
    opts = super(DV, cls).get_default_options()
    # Empty strings rather than None, since Defaults is a typesource
    # and needs typed values.
    opts.update({'ftpuser': '',
                 'ftppassword': '',
                 'mapfiletype': 'apache'})  # alternative: 'nginx'
    return opts
def canonical_uri(self, basefile):
    """Find the canonical URI for a case by reading its distilled RDF.

    The canonical URI for HDO/B3811-03 would be something like
    https://lagen.nu/dom/nja/2004s510, which cannot be known before the
    document is parsed. Once parsed, the first resource typed as
    rpubl:Rattsfallsreferat or rpubl:Rattsfallsnotis carries it.

    FIXME: It would be simpler and faster to read
    DocumentEntry(self.store.entry_path(basefile))['id'], but parse
    does not yet update the DocumentEntry once it has the canonical
    uri/id for the document.
    """
    p = self.store.distilled_path(basefile)
    if not os.path.exists(p):
        raise ValueError("No distilled file for basefile %s at %s" % (basefile, p))
    with self.store.open_distilled(basefile) as fp:
        graph = Graph().parse(data=fp.read())
    wanted_types = (RPUBL.Rattsfallsreferat, RPUBL.Rattsfallsnotis)
    for subject, rdftype in graph.subject_objects(predicate=RDF.type):
        if rdftype in wanted_types:
            return str(subject)
    raise ValueError("Can't find canonical URI for basefile %s in %s" % (basefile, p))
# we override make_document to avoid having it calling
# canonical_uri prematurely
def make_document(self, basefile=None):
    """Create a skeleton Document.

    Overridden to avoid calling canonical_uri prematurely -- the URI is
    only knowable after parsing, so doc.uri is deliberately left None.
    """
    doc = Document()
    doc.uri = None  # can't know this yet
    doc.basefile = basefile
    doc.lang = self.lang
    doc.meta = self.make_graph()
    doc.body = Body()
    return doc
# URI path segment under which case documents are published
urispace_segment = "rf"
# first year for which we expect downloaded files to exist, per report
# series -- used by basefile_from_uri to decide whether a URI without a
# corresponding basefile warrants a warning
expected_cases = {"ra": 1993,
                  "nja": 1981,
                  "rh": 1993,
                  "ad": 1993,
                  "mod": 1999,
                  "md": 2004}
# override to account for the fact that there is no 1:1
# correspondance between basefiles and uris
def basefile_from_uri(self, uri):
    """Map a canonical URI to its real basefile via the uri.map file.

    The superclass yields only the URI remainder (eg "nja/1995s362");
    the real basefile (eg "HDO/Ö463-95_1") is looked up in a lazily
    built in-memory copy of uri.map. For old cases with no downloaded
    file, a URI-derived skeleton basefile is returned instead.
    """
    def build_basefilemap(path, filename):
        # readmapfile callback: normalize one uri.map line and record
        # it in self._basefilemap
        if self.config.mapfiletype == "nginx":
            # chop of leading "/dom/"
            path = path[len(self.urispace_segment)+2:]
            # /dv/generated/HDO/T-254.html; => HDO/T-254
            filename = filename[14:-6]
        self._basefilemap[path] = filename
    basefile = super(DV, self).basefile_from_uri(uri)
    # the "basefile" at this point is just the remainder of the
    # URI (eg "nja/1995s362"). Create a lookup table to find the
    # real basefile (eg "HDO/Ö463-95_1")
    if basefile:
        if not hasattr(self, "_basefilemap"):
            self._basefilemap = {}
            self.readmapfile(build_basefilemap)
        if basefile in self._basefilemap:
            return self._basefilemap[basefile]
        else:
            # this will happen for older cases for which we don't
            # have any files. We invent URI-derived basefiles for
            # these to gain a sort of skeleton entry for those,
            # which we can use to track eg. frequently referenced
            # older cases.
            # however, we check if we OUGHT to have a basefile
            # (because it's recent enough) and warn.
            court, year = basefile.split("/", 1)
            year=int(year[:4])
            if court not in self.expected_cases or self.expected_cases[court] <= year:
                self.log.warning("%s: Could not find corresponding basefile" % uri)
            return basefile.replace(":", "/")
def readmapfile(self, callback):
    """Iterate over entries in the generated uri.map file(s).

    Calls ``callback(path, filename)`` for each tab-separated line.
    If the callback returns a non-None value, iteration stops and
    that value is returned; otherwise returns None.

    When running with a clientname, several processes may each have
    written their own uri-<client>-<pid>.map file, so all .map files
    in the directory are read.
    """
    # NOTE: removed dead locals (regex, idx, append_path) that were
    # computed per mapfiletype but never used -- leftovers of an
    # unfinished refactor. Line interpretation below is format-agnostic
    # since both formats are tab-separated.
    mapfile = self.store.path("uri", "generated", ".map")
    util.ensure_dir(mapfile)
    if self.config.clientname:
        mapfiles = list(util.list_dirs(os.path.dirname(mapfile), ".map"))
    else:
        mapfiles = [mapfile]
    for mapfile in mapfiles:
        if os.path.exists(mapfile):
            with codecs.open(mapfile, encoding="utf-8") as fp:
                for line in fp:
                    path, filename = line.strip().split("\t", 1)
                    ret = callback(path, filename)
                    if ret is not None:
                        return ret
def download(self, basefile=None):
    """Download all case documents, from DV's FTP server when
    credentials are configured, otherwise from the secondary source at
    lagen.nu. Does not accept a basefile argument."""
    if basefile is not None:
        raise ValueError("DV.download cannot process a basefile parameter")
    # recurse =~ download everything; do so when refresh is requested
    # or when we have never downloaded before. (If lastdownload has
    # never been set, the config only has the type value, and
    # attribute access would raise AttributeError -- hence the
    # membership test.)
    recurse = bool(self.config.refresh or 'lastdownload' not in self.config)
    # counts files extracted from zip files, not the zip files themselves
    self.downloadcount = 0
    try:
        if self.config.ftpuser:
            self.download_ftp("", recurse,
                              self.config.ftpuser,
                              self.config.ftppassword)
        else:
            self.log.warning(
                "Config variable ftpuser not set, downloading from secondary source (https://lagen.nu/dv/downloaded/) instead")
            self.download_www("", recurse)
    except errors.MaxDownloadsReached:
        # hit the configured cap -- we're done
        pass
def download_ftp(self, dirname, recurse, user=None, password=None, connection=None):
    """Recursively download zip files from DV's FTP server.

    Logs in on the first call (connection=None), then reuses the same
    connection for recursive calls into subdirectories. Each fetched
    zip is immediately unpacked via process_zipfile.
    """
    self.log.debug('Listing contents of %s' % dirname)
    lines = []
    if not connection:
        connection = FTP('ftp.dom.se')
        connection.login(user, password)
    connection.cwd(dirname)
    connection.retrlines('LIST', lines.append)
    for line in lines:
        parts = line.split()
        filename = parts[-1].strip()
        if line.startswith('d') and recurse:
            # a directory entry: recurse into it, reusing the connection
            self.download_ftp(filename, recurse, connection=connection)
        elif line.startswith('-'):
            # a regular file entry
            basefile = os.path.splitext(filename)[0]
            if dirname:
                basefile = dirname + "/" + basefile
            localpath = self.store.path(basefile, 'downloaded/zips', '.zip')
            if os.path.exists(localpath) and not self.config.refresh:
                pass  # we already got this
            else:
                util.ensure_dir(localpath)
                self.log.debug('Fetching %s to %s' % (filename,
                                                      localpath))
                # BUGFIX: use a context manager so the local file is
                # closed (and flushed) before process_zipfile reads it;
                # the old code passed open(...).write to retrbinary and
                # leaked the file handle.
                with open(localpath, 'wb') as outfp:
                    connection.retrbinary('RETR %s' % filename, outfp.write)
                self.process_zipfile(localpath)
    connection.cwd('/')
def download_www(self, dirname, recurse):
    """Download zip files from the secondary source (lagen.nu) by
    scraping its directory-listing pages over HTTPS.

    Mirrors download_ftp, recursing into subdirectory links and
    unpacking each fetched zip via process_zipfile.
    """
    url = 'https://lagen.nu/dv/downloaded/%s' % dirname
    self.log.debug('Listing contents of %s' % url)
    resp = requests.get(url)
    iterlinks = lxml.html.document_fromstring(resp.text).iterlinks()
    for element, attribute, link, pos in iterlinks:
        if link.startswith("/"):
            # site-absolute link (eg parent dir / site chrome) -- skip
            continue
        elif link.endswith("/") and recurse:
            # subdirectory listing -- the relative link becomes the
            # dirname of the recursive call
            self.download_www(link, recurse)
        elif link.endswith(".zip"):
            basefile = os.path.splitext(link)[0]
            if dirname:
                # dirname ends with "/" (it came from a dir link), so
                # plain concatenation yields eg "HDO/file"
                basefile = dirname + basefile
            # localpath = self.store.downloaded_path(basefile)
            localpath = self.store.path(basefile, 'downloaded/zips', '.zip')
            if os.path.exists(localpath) and not self.config.refresh:
                pass  # we already got this
            else:
                absolute_url = urljoin(url, link)
                self.log.debug('Fetching %s to %s' % (link, localpath))
                resp = requests.get(absolute_url)
                with self.store.open(basefile, "downloaded/zips", ".zip", "wb") as fp:
                    fp.write(resp.content)
                self.process_zipfile(localpath)
# Regexes for interpreting filenames found inside the downloaded zips.
# Groups are (court, case number, optional referat number in either of
# two positions, suffix).
#
# eg. HDO_T3467-96.doc or HDO_T3467-96_1.doc
re_malnr = re.compile(r'([^_]*)_([^_\.]*)()_?(\d*)(\.docx?)')
# "replace" files, which supersede an earlier verdict:
# eg. HDO_T3467-96_BYTUT_2010-03-17.doc or
# HDO_T3467-96_BYTUT_2010-03-17_1.doc or
# HDO_T254-89_1_BYTUT_2009-04-28.doc (which is sort of the
# same as the above but the "_1" goes in a different place)
re_bytut_malnr = re.compile(
    r'([^_]*)_([^_\.]*)_?(\d*)_BYTUT_\d+-\d+-\d+_?(\d*)(\.docx?)')
# "remove" files, which withdraw an earlier verdict (same shape as BYTUT)
re_tabort_malnr = re.compile(
    r'([^_]*)_([^_\.]*)_?(\d*)_TABORT_\d+-\d+-\d+_?(\d*)(\.docx?)')
# temporary helper
@action
def process_all_zipfiles(self):
    """(Re-)process every previously downloaded zip file."""
    self.downloadcount = 0
    zippath = self.store.path('', 'downloaded/zips', '')

    def sortkey(path):
        # zips in subdirectories (HDO, ADO etc) sort first, then by
        # the digits embedded in the path, in numeric-ish order
        digits = "".join(ch for ch in path if ch.isnumeric())
        return (-len(path.split(os.sep)), digits)

    for zname in sorted(util.list_dirs(zippath, suffix=".zip"), key=sortkey):
        self.log.info("%s: Processing..." % zname)
        self.process_zipfile(zname)
@action
def process_zipfile(self, zipfilename):
    """Extract a named zipfile into appropriate documents.

    Handles three kinds of entries: "_notis_" word files containing
    many small notises (delegated to extract_notis), BYTUT/TABORT
    files which replace/withdraw previously published verdicts, and
    plain verdict files. Increments self.downloadcount per extracted
    file and raises MaxDownloadsReached at config.downloadmax.
    """
    removed = replaced = created = untouched = 0
    if not hasattr(self, 'downloadcount'):
        # set here as well, since this method may be called directly
        # (not only via download())
        self.downloadcount = 0
    try:
        zipf = zipfile.ZipFile(zipfilename, "r")
    except zipfile.BadZipfile as e:
        self.log.error("%s is not a valid zip file: %s" % (zipfilename, e))
        return
    for bname in zipf.namelist():
        if not isinstance(bname, str):  # py2
            # Files in the zip file are encoded using codepage 437
            name = bname.decode('cp437')
        else:
            name = bname
        if "_notis_" in name:
            base, suffix = os.path.splitext(name)
            segments = base.split("_")
            coll, year = segments[0], segments[1]
            # Extract this doc as a temp file -- we won't be
            # creating an actual permanent file, but let
            # extract_notis extract individual parts of this file
            # to individual basefiles
            fp = tempfile.NamedTemporaryFile("wb", suffix=suffix, delete=False)
            filebytes = zipf.read(bname)
            fp.write(filebytes)
            fp.close()
            tempname = fp.name
            r = self.extract_notis(tempname, year, coll)
            created += r[0]
            untouched += r[1]
            os.unlink(tempname)
        else:
            # strip any directory part of the entry name
            name = os.path.split(name)[1]
            if 'BYTUT' in name:
                m = self.re_bytut_malnr.match(name)
            elif 'TABORT' in name:
                m = self.re_tabort_malnr.match(name)
            else:
                m = self.re_malnr.match(name)
            if m:
                (court, malnr, opt_referatnr, referatnr, suffix) = (
                    m.group(1), m.group(2), m.group(3), m.group(4), m.group(5))
                assert ((suffix == ".doc") or (suffix == ".docx")
                        ), "Unknown suffix %s in %r" % (suffix, name)
                # the referat number can occur in either of two
                # positions in the filename (see re_bytut_malnr)
                if referatnr:
                    basefile = "%s/%s_%s" % (court, malnr, referatnr)
                elif opt_referatnr:
                    basefile = "%s/%s_%s" % (court, malnr, opt_referatnr)
                else:
                    basefile = "%s/%s" % (court, malnr)
                basefile = basefile.strip()  # to avoid spurious trailing spaces in the filename before the file suffix
                outfile = self.store.path(basefile, 'downloaded', suffix)
                if "TABORT" in name:
                    # withdrawal: remove the previously downloaded file
                    self.log.info("%s: Removing" % basefile)
                    if not os.path.exists(outfile):
                        self.log.warning("%s: %s doesn't exist" % (basefile,
                                                                   outfile))
                    else:
                        os.unlink(outfile)
                    removed += 1
                elif "BYTUT" in name:
                    # replacement: the new content is written below
                    self.log.info("%s: download OK (replacing with new)" % basefile)
                    if not os.path.exists(outfile):
                        self.log.warning("%s: %s doesn't exist" %
                                         (basefile, outfile))
                    replaced += 1
                else:
                    self.log.info("%s: download OK (unpacking)" % basefile)
                    if os.path.exists(outfile):
                        untouched += 1
                        continue
                    else:
                        created += 1
                if not "TABORT" in name:
                    data = zipf.read(bname)
                    with self.store.open(basefile, "downloaded", suffix, "wb") as fp:
                        fp.write(data)
                    # Make the unzipped files have correct timestamp
                    zi = zipf.getinfo(bname)
                    dt = datetime(*zi.date_time)
                    ts = mktime(dt.timetuple())
                    os.utime(outfile, (ts, ts))
                    self.downloadcount += 1
                    # honour the configured download cap, if any
                    if ('downloadmax' in self.config and
                            self.config.downloadmax and
                            self.downloadcount >= self.config.downloadmax):
                        raise errors.MaxDownloadsReached()
            else:
                self.log.warning('Could not interpret filename %r i %s' %
                                 (name, os.path.relpath(zipfilename)))
    self.log.debug('Processed %s, created %s, replaced %s, removed %s, untouched %s files' %
                   (os.path.relpath(zipfilename), created, replaced, removed, untouched))
def extract_notis(self, docfile, year, coll="HDO"):
    """Split a word file containing many notises into one intermediate
    XML file per notis.

    Given a word document containing a set of "notisfall" from
    either HD or HFD (earlier RegR), spit out a constructed
    intermediate XML file for each notis and create a empty
    placeholder downloaded file. The empty file should never be
    opened since parse_open will prefer the constructed
    intermediate file.

    Returns a (created, untouched) tuple of counts.
    """
    def find_month_in_previous(basefile):
        # The big word file with all notises might not
        # start with a month name -- try to find out
        # current month by examining the previous notis
        # (belonging to a previous word file).
        #
        # FIXME: It's possible that the two word files might be
        # different types (eg docx and doc). In that case the
        # resulting file will contain both OOXML and DocBook tags.
        self.log.warning(
            "No month specified in %s, attempting to look in previous file" %
            basefile)
        # HDO/2009_not_26 -> HDO/2009_not_25
        tmpfunc = lambda x: str(int(x.group(0)) - 1)
        prev_basefile = re.sub('\d+$', tmpfunc, basefile)
        prev_path = self.store.intermediate_path(prev_basefile)
        avd_p = None
        if os.path.exists(prev_path):
            with self.store.open_intermediate(prev_basefile, "rb") as fp:
                soup = BeautifulSoup(fp.read(), "lxml")
            # the first paragraph of the previous file should be the
            # month heading
            tmp = soup.find(["w:p", "para"])
            if re_avdstart.match(tmp.get_text().strip()):
                avd_p = tmp
        if not avd_p:
            raise errors.ParseError(
                "Cannot find value for month in %s (looked in %s)" %
                (basefile, prev_path))
        return avd_p
    if coll == "HDO":
        # matches eg "Den 4:e. 26. (B 1234-09)" -- optional day,
        # ordinal and case number
        re_notisstart = re.compile(
            "(?P<day>Den \d+:[ae]. |)(?P<ordinal>\d+)\s*\.\s*\((?P<malnr>\w\s\d+-\d+)\)",
            flags=re.UNICODE)
        # a month name on a line of its own starts a new section
        re_avdstart = re.compile(
            "(Januari|Februari|Mars|April|Maj|Juni|Juli|Augusti|September|Oktober|November|December)$")
    else:  # REG / HFD
        re_notisstart = re.compile(
            "[\w\: ]*Lnr:(?P<court>\w+) ?(?P<year>\d+) ?not ?(?P<ordinal>\d+)",
            flags=re.UNICODE)
        re_avdstart = None
    created = untouched = 0
    # convert the word file to intermediate XML, load it, then discard
    # the temporary XML file
    intermediatefile = os.path.splitext(docfile)[0] + ".xml"
    with open(intermediatefile, "wb") as fp:
        filetype = WordReader().read(docfile, fp)
    soup = BeautifulSoup(util.readfile(intermediatefile), "lxml")
    os.unlink(intermediatefile)  # won't be needed past this point
    if filetype == "docx":
        p_tag = "w:p"
        xmlns = ' xmlns:w14="http://schemas.microsoft.com/office/word/2010/wordml" xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main" xmlns:r="http://schemas.openxmlformats.org/officeDocument/2006/relationships"'
    else:
        p_tag = "para"
        xmlns = ''
    iterator = soup.find_all(p_tag, limit=2147483647)
    # streaming state: fp is the intermediate file of the notis
    # currently being written, avd_p the current month paragraph, day
    # the most recently seen day-of-month prefix
    basefile = None
    fp = None
    avd_p = None
    day = None
    intermediate_path = None
    for p in iterator:
        t = p.get_text().strip()
        if re_avdstart:
            # keep track of current month, store that in avd_p
            m = re_avdstart.match(t)
            if m:
                avd_p = p
                continue
        m = re_notisstart.match(t)
        if m:
            # this paragraph starts a new notis
            ordinal = m.group("ordinal")
            try:
                if m.group("day"):
                    day = m.group("day")
                else:
                    # inject current day in the first text node of
                    # p (which should inside of a <emphasis
                    # role="bold" or equivalent).
                    subnode = None
                    # FIXME: is this a deprecated method?
                    for c in p.recursiveChildGenerator():
                        if isinstance(c, NavigableString):
                            c.string.replace_with(day + str(c.string))
                            break
            except IndexError:
                # the REG/HFD regex has no "day" group -- m.group()
                # then raises IndexError, which we ignore
                pass
            if intermediate_path:
                # NOTE(review): intermediate_path is never reassigned
                # in this method, so this branch looks dead and
                # previous_intermediate_path is unused -- confirm
                previous_intermediate_path = intermediate_path
            basefile = "%(coll)s/%(year)s_not_%(ordinal)s" % locals()
            self.log.info("%s: Extracting from %s file" % (basefile, filetype))
            created += 1
            downloaded_path = self.store.path(basefile, 'downloaded', '.' + filetype)
            util.ensure_dir(downloaded_path)
            with open(downloaded_path, "w"):
                pass  # just create an empty placeholder file --
                      # parse_open will load the intermediate file
                      # anyway.
            if fp:
                # finish the previous notis' intermediate file
                fp.write(b"</body>\n")
                fp.close()
            util.ensure_dir(self.store.intermediate_path(basefile))
            fp = self.store.open_intermediate(basefile, mode="wb")
            bodytag = '<body%s>' % xmlns
            fp.write(bodytag.encode("utf-8"))
            if filetype != "docx":
                fp.write(b"\n")
            if coll == "HDO" and not avd_p:
                # no month heading seen yet -- look in the previous file
                avd_p = find_month_in_previous(basefile)
            if avd_p:
                fp.write(str(avd_p).encode("utf-8"))
        if fp:
            # append the current paragraph to the current notis
            fp.write(str(p).encode("utf-8"))
            if filetype != "docx":
                fp.write(b"\n")
    if fp:  # should always be the case
        fp.write(b"</body>\n")
        fp.close()
    else:
        self.log.error("%s/%s: No notis were extracted (%s)" %
                       (coll, year, docfile))
    return created, untouched
# splits "a; b, c"-style enumerations (used for eg Lagrum fields)
re_delimSplit = re.compile("[;,] ?").split
# maps header field labels, as they appear in the source documents, to
# the RDF predicates they should be expressed with
labels = {'Rubrik': DCTERMS.description,
          'Domstol': DCTERMS.creator,
          'Målnummer': RPUBL.malnummer,
          'Domsnummer': RPUBL.domsnummer,
          'Diarienummer': RPUBL.diarienummer,
          'Avdelning': RPUBL.domstolsavdelning,
          'Referat': DCTERMS.identifier,
          'Avgörandedatum': RPUBL.avgorandedatum,
          }
def remote_url(self, basefile):
    """Always return None -- there is no publicly available URL where
    the source document could be fetched."""
    return None
def adjust_basefile(self, doc, orig_uri):
    """Deliberate no-op for DV; see comments in swedishlegalsource.py."""
    pass
def parse_open(self, basefile, attachment=None):
    """Open the intermediate file for parsing, creating it from the
    downloaded file if it does not yet exist.

    Also sniffs whether the intermediate XML came from a .doc
    (DocBook) or .docx (OOXML) source, storing the result in
    self.filetype for later parse stages. Returns the (possibly
    patched) file object, positioned at the start.
    """
    intermediate_path = self.store.intermediate_path(basefile)
    if not os.path.exists(intermediate_path):
        fp = self.downloaded_to_intermediate(basefile)
    else:
        fp = self.store.open_intermediate(basefile, "rb")
    # Determine if the previously-created intermediate files
    # came from .doc or OOXML (.docx) sources by sniffing the
    # first bytes.
    start = fp.read(6)
    assert isinstance(start, bytes), "fp seems to have been opened in a text-like mode"
    if start in (b"<w:doc", b"<body "):
        filetype = "docx"
    elif start in (b"<book ", b"<book>", b"<body>"):
        filetype = "doc"
    else:
        raise ValueError("Can't guess filetype from %r" % start)
    fp.seek(0)  # rewind after sniffing
    self.filetype = filetype
    return self.patch_if_needed(fp, basefile)
def downloaded_to_intermediate(self, basefile, attachment=None):
    """Convert a downloaded word file to an intermediate XML file,
    returning a read-mode file object for it.

    Notisfall basefiles are not supported here; their intermediate
    files are constructed by extract_notis instead.
    """
    assert "_not_" not in basefile, "downloaded_to_intermediate can't handle Notisfall %s" % basefile
    docfile = self.store.downloaded_path(basefile)
    intermediatefile = self.store.intermediate_path(basefile)
    if os.path.getsize(docfile) == 0:
        raise errors.ParseError("%s: Downloaded file %s is empty, %s should have "
                                "been created by download() but is missing!" %
                                (basefile, docfile, intermediatefile))
    wr = WordReader()
    fp = self.store.open_intermediate(basefile, mode="wb")
    self.filetype = wr.read(docfile, fp, simplify=True)
    # FIXME: Do something with filetype if it's not what we expect
    fp.close()
    if hasattr(fp, 'utime'):
        # propagate the source file's timestamp to the intermediate file
        os.utime(self.store.intermediate_path(basefile), (fp.utime, fp.utime))
    # re-open in read mode -- since we can't open a compressed
    # file in read-write mode
    return self.store.open_intermediate(basefile, mode="rb")
def extract_head(self, fp, basefile):
    """Parse the intermediate file and return the metadata dict.

    Dispatches on whether the document is a notis and on the source
    filetype sniffed by parse_open. The body chunks are stashed in
    self._rawbody for extract_body to pick up later.
    """
    filetype = self.filetype
    patched = fp.read()
    # rawhead is a simple dict that we'll later transform into a
    # rdflib Graph. rawbody is a list of plaintext strings, each
    # representing a paragraph.
    if "not" in basefile:
        rawhead, rawbody = self.parse_not(patched, basefile, filetype)
    else:
        if filetype == "docx":
            rawhead, rawbody = self.parse_ooxml(patched, basefile)
        else:
            rawhead, rawbody = self.parse_antiword_docbook(patched, basefile)
    self._rawbody = rawbody
    return rawhead
def extract_metadata(self, rawhead, basefile):
    """Pass-through: all metadata extraction already happened in
    extract_head."""
    return rawhead
def parse_entry_title(self, doc):
    """Return the rpubl:referatrubrik as the entry title, if present.

    FIXME: The primary use for entry.title is to generate
    feeds. Should we construct a feed-friendly title here
    (rpubl:referatrubrik is often too wordy, dcterm:identifier +
    dcterms:subject might be a better choice -- also notisfall
    does not have any rpubl:referatrubrik)
    """
    rubrik = doc.meta.value(URIRef(doc.uri), RPUBL.referatrubrik)
    return str(rubrik) if rubrik else None
def extract_body(self, fp, basefile):
    """Return the body chunks stashed away by extract_head."""
    return self._rawbody
def sanitize_body(self, rawbody):
    """Clean up raw body chunks and wrap each in a Paragraph.

    Undoes double-newline-between-every-line formatting accidents,
    splits headings and delmål (sub-case) markers that are smushed
    together with surrounding text, and inserts a missing initial "I"
    delmål marker when later markers ("II", "III"...) are present.
    """
    result = []
    seen_delmal = {}
    if not rawbody:
        # guard: empty input would otherwise crash on rawbody[0] below
        return result
    # ADO 1994 nr 102 to nr 113 have double \n between *EVERY
    # LINE*, not between every paragraph. Lines are short, less
    # than 60 chars. This leads to is_heading matching almost
    # every chunk. The weirdest thing is that the specific line
    # starting with "Ledamöter: " do NOT exhibit this trait... Try
    # to detect and undo.
    if (isinstance(rawbody[0], str) and  # Notisfall rawbody is a list of lists...
            max(len(line) for line in rawbody if not line.startswith("Ledamöter: ")) < 60):
        self.log.warning("Source has double newlines between every line, attempting "
                         "to reconstruct sections")
        newbody = []
        currentline = ""
        for idx, line in enumerate(rawbody):
            if (line.isupper() or  # this is a obvious header
                    (idx + 1 < len(rawbody) and rawbody[idx+1].isupper()) or  # next line is a obvious header
                    (idx + 1 < len(rawbody) and  # line is short, probably ends a sentence, and next line starts one
                     len(line) < 45 and
                     line[-1] in (".", "?", "0", "1", "2", "3", "4", "5", "6", "7", "8", "9") and
                     rawbody[idx+1][0].isupper()) or
                    (idx + 1 < len(rawbody) and re.match("\d\.\s+[A-ZÅÄÖ]", rawbody[idx+1]))  # next line seem to be a ordered paragraph
                    ):
                newbody.append(currentline + "\n" + line)
                currentline = ""
            else:
                currentline += "\n" + line
        rawbody = newbody
    for idx, x in enumerate(rawbody):
        if isinstance(x, str):
            # detect and fix smushed numbered sections which MD
            # has, eg "18Marknadsandelar är..." ->
            # "18. Marknadsandelar är..."
            x = re.sub("^(\d{1,3})([A-ZÅÄÖ])", r"\1. \2", x)
            m = re.match("(KÄRANDE|SVARANDE|SAKEN)([A-ZÅÄÖ].*)", x)
            if m:
                # divide smushed-together headings like MD has,
                # eg. "SAKENMarknadsföring av bilverkstäder..."
                x = [m.group(1), m.group(2)]
            else:
                # match smushed-together delmål markers like in "(jfr
                # 1990 s 772 och s 796)I" and "Domslut HD fastställer
                # HovR:ns domslut.II"
                #
                # But since we apparently need to handle spaces before
                # "I", we might get false positives with sentences like
                # "...och Dalarna. I\ndistributionsrörelsen
                # sysselsattes...". Try to avoid this by checking for
                # probable sentence start in next line.
                #
                # BUGFIX: also guard against running off the end of
                # rawbody (or indexing an empty next chunk) -- the old
                # code read rawbody[idx+1][0] unconditionally and
                # raised IndexError when the last chunk ended with "I".
                m = re.match("(.*[\.\) ])(I+)$", x, re.DOTALL)
                if (m and idx + 1 < len(rawbody) and rawbody[idx+1] and
                        rawbody[idx+1][0].isupper() and
                        not re.search("mellandomstema I+$", x, flags=re.IGNORECASE)):
                    x = [m.group(1), m.group(2)]
                else:
                    x = [x]
            for p in x:
                # record which delmål markers ("I".."III", "IV",
                # optionally followed by a case number) we have seen
                m = re.match("(I{1,3}|IV)\.? ?(|\(\w+\-\d+\))$", p)
                if m:
                    seen_delmal[m.group(1)] = True
                result.append(Paragraph([p]))
        else:
            # already a list of parts (notisfall) -- wrap as-is
            result.append(Paragraph(x))
    # Many referats that are split into delmål lack the first
    # initial "I" that signifies the start of the first delmål
    # (but do have "II", "III" and possibly more)
    if seen_delmal and "I" not in seen_delmal:
        self.log.warning("Inserting missing 'I' for first delmal")
        result.insert(0, Paragraph(["I"]))
    return result
def parse_not(self, text, basefile, filetype):
    """Parse the intermediate XML of a single notis.

    Returns a (head, body) tuple, where head is a dict of metadata
    fields (Referat, Domstol, Rubrik, Målnummer, ...) and body is a
    list of lines, each a list of str/Strong/Em parts.
    """
    basefile_regex = re.compile("(?P<type>\w+)/(?P<year>\d+)_not_(?P<ordinal>\d+)")
    # per-court template for the dcterms:identifier-like Referat field
    referat_templ = {'REG': 'RÅ %(year)s not %(ordinal)s',
                     'HDO': 'NJA %(year)s not %(ordinal)s',
                     'HFD': 'HFD %(year)s not %(ordinal)s'}
    head = {}
    body = []
    m = basefile_regex.match(basefile).groupdict()
    coll = m['type']
    head["Referat"] = referat_templ[coll] % m
    soup = BeautifulSoup(text, "lxml")
    if filetype == "docx":
        ptag = "w:p", "para"  # support intermediate files with a
                              # mix of OOXML/DocBook
    else:
        ptag = "para", "w:p"
    iterator = soup.find_all(ptag, limit=2147483647)
    if coll == "HDO":
        # keep this in sync w extract_notis
        re_notisstart = re.compile(
            "(?:Den (?P<avgdatum>\d+):[ae].\s+|)(?P<ordinal>\d+)\s*\.\s*\((?P<malnr>\w[ \xa0]\d+-\d+)\)",
            flags=re.UNICODE)
        # for HDO, one regex covers date, ordinal and case number
        re_avgdatum = re_malnr = re_notisstart
        re_lagrum = re_sokord = None
        # headers consist of the first two chunks. (month, then
        # date+ordinal+malnr)
        header = iterator.pop(0), iterator[0]  # need to re-read the second line later
        curryear = m['year']
        currmonth = self.swedish_months[header[0].get_text().strip().lower()]
        secondline = util.normalize_space(header[-1].get_text())
        m = re_notisstart.match(secondline)
        if m:
            # everything after the match is the heading (rubrik)
            head["Rubrik"] = secondline[m.end():].strip()
            if curryear == "2003":
                # notisfall in this year lack newline between heading
                # and actual text, so we use a heuristic to just match
                # first sentence
                m2 = re.search("\. [A-ZÅÄÖ]", head["Rubrik"])
                if m2:
                    # now we know where the first sentence ends. Only keep that.
                    head["Rubrik"] = head["Rubrik"][:m2.start()+1]
    else:  # "REG", "HFD"
        # keep in sync like above
        re_notisstart = re.compile(
            "[\w\: ]*Lnr:(?P<court>\w+) ?(?P<year>\d+) ?not ?(?P<ordinal>\d+)")
        re_malnr = re.compile(r"[AD][:-] ?(?P<malnr>\d+\-\d+)")
        # the avgdatum regex attempts to include valid dates, eg
        # not "2770-71-12".It's also somewhat tolerant of
        # formatting mistakes, eg accepts " :03-06-16" instead of
        # "A:03-06-16"
        re_avgdatum = re.compile(r"[AD ]: ?(?P<avgdatum>\d{2,4}\-[01]\d\-\d{2})")
        re_sokord = re.compile("Uppslagsord: ?(?P<sokord>.*)", flags=re.DOTALL)
        re_lagrum = re.compile("Lagrum: ?(?P<lagrum>.*)", flags=re.DOTALL)
        # headers consists of the first five or six
        # chunks. Doesn't end until "^Not \d+."
        header = []
        done = False
        while not done and iterator:
            line = iterator[0].get_text().strip()
            # can possibly be "Not 1a." (RÅ 1994 not 1) or
            # "Not. 109." (RÅ 1998 not 109). There might be a
            # space separating the notis from the next sentence,
            # but there might also not be!
            if re.match("Not(is|)\.? \d+[abc]?\.? ?", line):
                done = True
                if ". - " in line[:2000]:
                    # Split out "Not 56" and the first
                    # sentence up to ". -", signalling that is the
                    # equiv of referatrubrik
                    rubr = line.split(". - ", 1)[0]
                    rubr = re.sub("Not(is|)\.? \d+[abc]?\.? ", "", rubr)
                    head['Rubrik'] = rubr
            else:
                if line.endswith("Notisen har utgått."):
                    # the notis was formally withdrawn
                    raise errors.DocumentRemovedError(basefile, dummyfile=self.store.parsed_path(basefile))
                else:
                    tmp = iterator.pop(0)
                    if tmp.get_text().strip():
                        # REG specialcase
                        if header and header[-1].get_text().strip() == "Lagrum:":
                            # get the first bs4.element.Tag child
                            children = [x for x in tmp.children if hasattr(x, 'get_text')]
                            if children:
                                header[-1].append(children[0])
                        else:
                            header.append(tmp)
        if not done:
            raise errors.ParseError("Cannot find notis number in %s" % basefile)
    # Domstol is implied by the collection
    if coll == "HDO":
        head['Domstol'] = "Högsta Domstolen"
    elif coll == "HFD":
        head['Domstol'] = "Högsta förvaltningsdomstolen"
    elif coll == "REG":
        head['Domstol'] = "Regeringsrätten"
    else:
        raise errors.ParseError("Unsupported: %s" % coll)
    for node in header:
        t = util.normalize_space(node.get_text())
        # if not malnr, avgdatum found, look for those
        for fld, key, rex in (('Målnummer', 'malnr', re_malnr),
                              ('Avgörandedatum', 'avgdatum', re_avgdatum),
                              ('Lagrum', 'lagrum', re_lagrum),
                              ('Sökord', 'sokord', re_sokord)):
            if not rex:
                continue
            m = rex.search(t)
            if m and m.group(key):
                # NOTE(review): ('Lagrum') is a plain string, not a
                # tuple, so this is a substring test -- it happens to
                # work since fld is exactly "Lagrum" when it matches
                if fld in ('Lagrum'):  # Sökord is split by sanitize_metadata
                    head[fld] = self.re_delimSplit(m.group(key))
                else:
                    head[fld] = m.group(key)
    if coll == "HDO" and 'Avgörandedatum' in head:
        # HDO only captured the day-of-month; combine with the year and
        # the month taken from the section heading
        head['Avgörandedatum'] = "%s-%02d-%02d" % (curryear, currmonth, int(head['Avgörandedatum']))
    # Do a basic conversion of the rest (bodytext) to Element objects
    #
    # This is generic enough that it could be part of WordReader
    for node in iterator:
        line = []
        if filetype == "doc":
            subiterator = node
        elif filetype == "docx":
            subiterator = node.find_all("w:r", limit=2147483647)
        for part in subiterator:
            if part.name:
                t = part.get_text()
            else:
                t = str(part)  # convert NavigableString to pure string
            if filetype == "doc" and part.name == "emphasis":  # docbook
                # merge adjacent runs of the same style into one element
                if part.get("role") == "bold":
                    if line and isinstance(line[-1], Strong):
                        line[-1][-1] += t
                    else:
                        line.append(Strong([t]))
                else:
                    if line and isinstance(line[-1], Em):
                        line[-1][-1] += t
                    else:
                        line.append(Em([t]))
            # ooxml
            elif filetype == "docx" and part.find("w:rpr") and part.find("w:rpr").find(["w:b", "w:i"]):
                rpr = part.find("w:rpr")
                if rpr.find("w:b"):
                    if line and isinstance(line[-1], Strong):
                        line[-1][-1] += t
                    else:
                        line.append(Strong([t]))
                elif rpr.find("w:i"):
                    if line and isinstance(line[-1], Em):
                        line[-1][-1] += t
                    else:
                        line.append(Em([t]))
            else:
                # unstyled text -- merge with a preceding plain string
                if line and isinstance(line[-1], str):
                    line[-1] += t
                else:
                    line.append(t)
        if line:
            body.append(line)
    return head, body
def parse_ooxml(self, text, basefile):
    """Extract metadata and body text from an OOXML (docx) verdict.

    :param text: the raw OOXML document contents
    :param basefile: the basefile id, used only for log messages
    :returns: a (head, body) tuple, where head is a dict of metadata
              fields keyed by their Swedish label (eg "Domstol",
              "Referat") and body is a list of paragraph strings.
    """
    soup = BeautifulSoup(text, "lxml")
    head = {}
    # At the very top of each verdict is the court name ("Högsta
    # domstolen") followed by the report id ("NJA 1987 s. 113").
    firstfield = soup.find("w:t")
    # Sometimes the court name is split over two w:r elements; safest
    # to use all text of the parent w:tc cell.
    firstfield = firstfield.find_parent("w:tc")
    head['Domstol'] = firstfield.get_text(strip=True)
    nextfield = firstfield.find_next("w:tc")
    head['Referat'] = nextfield.get_text(strip=True)
    # Find the remaining simple metadata fields in the document header
    for key in self.labels:
        if key in head:
            continue
        node = soup.find(text=re.compile(key + ':'))
        if not node:
            # FIXME: should warn for missing Målnummer iff
            # Domsnummer is not present, and vice versa. But at
            # this point we don't have all fields
            if key not in ('Diarienummer', 'Domsnummer', 'Avdelning', 'Målnummer'):
                self.log.warning("%s: Couldn't find field %r" % (basefile, key))
            continue
        txt = "".join([n.get_text() for n in node.find_next("w:t").find_parent("w:p").find_all("w:t", limit=2147483647)])
        if txt.strip():  # skip fields with empty (or whitespace-only) values
            head[key] = txt
    # Find compound (list-valued) metadata in the header
    for key in ["Lagrum", "Rättsfall"]:
        node = soup.find(text=re.compile(key + ':'))
        if node:
            textnodes = node.find_parent('w:tc').find_next_sibling('w:tc')
            if not textnodes:
                continue
            items = []
            for textnode in textnodes.find_all('w:t', limit=2147483647):
                t = textnode.get_text(strip=True)
                if t:
                    items.append(t)
            if items:
                head[key] = items
    # The main text body of the verdict: everything in the table row
    # following the one containing "REFERAT"
    body = []
    for p in soup.find(text=re.compile('EFERAT')).find_parent(
            'w:tr').find_next_sibling('w:tr').find_all('w:p', limit=2147483647):
        ptext = ''
        for e in p.find_all("w:t", limit=2147483647):
            ptext += e.string
        body.append(ptext)
    # Finally, some more metadata in the footer (hoist each find() --
    # previously the same search ran twice per field)
    sokord_node = soup.find(text=re.compile(r'Sökord:'))
    if sokord_node:
        head['Sökord'] = sokord_node.find_next('w:t').get_text(strip=True)
    # NB: raw string; the old non-raw '\s' escapes are deprecated
    litteratur_node = soup.find(text=re.compile(r'^\s*Litteratur:\s*$'))
    if litteratur_node:
        head['Litteratur'] = litteratur_node.findNext('w:t').get_text(strip=True)
    return head, body
def parse_antiword_docbook(self, text, basefile):
    """Extract metadata and body text from an antiword docbook rendering
    of a .doc verdict.

    :param text: the docbook XML contents
    :param basefile: the basefile id (unused here, kept for signature
                     parity with parse_ooxml)
    :returns: a (head, body) tuple analogous to parse_ooxml's.
    """
    soup = BeautifulSoup(text, "lxml")
    head = {}
    header_elements = soup.find("para")
    header_text = ''
    for el in header_elements.contents:
        if hasattr(el, 'name') and el.name == "informaltable":
            break
        else:
            header_text += el.string
    # At the very top of each verdict is the court name ("Högsta
    # domstolen") followed by the report id ("NJA 1987 s. 113").
    # Depending on the Word document the XML structure differs. The
    # common case is that the information is in a pipe-separated
    # paragraph:
    parts = [x.strip() for x in header_text.split("|")]
    if len(parts) > 1:
        head['Domstol'] = parts[0]
        head['Referat'] = parts[1]
    else:
        # alternatively they are on the first row of an informaltable
        row = soup.find("informaltable").tgroup.tbody.row.find_all('entry')
        head['Domstol'] = row[0].get_text(strip=True)
        head['Referat'] = row[1].get_text(strip=True)
    # Find the remaining simple metadata fields in the header
    for key in self.labels:
        node = soup.find(text=re.compile(key + ':'))
        if node:
            txt = node.find_parent('entry').find_next_sibling(
                'entry').get_text(strip=True)
            if txt:
                head[key] = txt
    # Find compound (list-valued) metadata in the header
    for key in ["Lagrum", "Rättsfall"]:
        node = soup.find(text=re.compile(key + ':'))
        if node:
            head[key] = []
            textchunk = node.find_parent(
                'entry').find_next_sibling('entry').string
            for line in [util.normalize_space(x) for x in textchunk.split("\n\n")]:
                if line:
                    head[key].append(line)
    # The main text body: the tgroup following the one containing
    # "REFERAT", split into paragraphs on double newlines
    body = soup.find(text=re.compile('REFERAT')).find_parent('tgroup').find_next_sibling(
        'tgroup').find('entry').get_text(strip=True).split("\n\n")
    # Find compound metadata in the footer
    head['Sökord'] = soup.find(text=re.compile('Sökord:')).find_parent(
        'entry').next_sibling.next_sibling.get_text(strip=True)
    # NB: raw string; the old non-raw '\s' escapes are deprecated.
    # Also hoisted: the same find() previously ran twice.
    litteratur_node = soup.find(text=re.compile(r'^\s*Litteratur:\s*$'))
    if litteratur_node:
        head['Litteratur'] = litteratur_node.find_parent(
            'entry').next_sibling.next_sibling.get_text(strip=True)
    return head, body
# correct broken/missing metadata
def sanitize_metadata(self, head, basefile):
    """Correct broken or missing metadata in head, in place.

    Normalizes Referat, Domstol, Målnummer, Sökord and Avgörandedatum,
    and synthesizes the keys '_localid' (always) and '_nja_ordinal'
    (for NJA referats).

    :param head: metadata dict as returned by one of the parse_*
                 methods
    :param basefile: the basefile id, used to reconstruct missing data
    :returns: the (mutated) head dict
    :raises errors.ParseError: when a required field is missing or
                               unparseable
    :raises KeyError: (indirectly) if 'Domstol' is missing entirely
    """
    # NB: all patterns are raw strings; the previous non-raw '\w',
    # '\d' etc. escapes are deprecated in modern Python.
    basefile_regex = re.compile(r'(?P<type>\w+)/(?P<year>\d+)-(?P<ordinal>\d+)')
    nja_regex = re.compile(
        r"NJA ?(\d+) ?s\.? ?(\d+) *\( ?(?:NJA|) ?[ :]?(\d+) ?: ?(\d+)")
    date_regex = re.compile(r"(\d+)[^\d]+(\d+)[^\d]+(\d+)")
    referat_regex = re.compile(
        r"(?P<type>[A-ZÅÄÖ]+)[^\d]*(?P<year>\d+)[^\d]+(?P<ordinal>\d+)")
    referat_templ = {'ADO': 'AD %(year)s nr %(ordinal)s',
                     'AD': '%(type)s %(year)s nr %(ordinal)s',
                     'MDO': 'MD %(year)s:%(ordinal)s',
                     'NJA': '%(type)s %(year)s s. %(ordinal)s',
                     None: '%(type)s %(year)s:%(ordinal)s'
                     }
    # 0. strip whitespace
    for k, v in head.items():
        if isinstance(v, str):
            head[k] = v.strip()
    # 1. Attempt to fix missing Referat
    if not head.get("Referat"):
        # For some courts (MDO, ADO) it's possible to reconstruct a missing
        # Referat from the basefile
        m = basefile_regex.match(basefile)
        if m and m.group("type") in ('ADO', 'MDO'):
            head["Referat"] = referat_templ[m.group("type")] % (m.groupdict())
    # 2. Correct known problems with Domstol not always being correctly specified
    if "Hovrättenför" in head["Domstol"] or "Hovrättenöver" in head["Domstol"]:
        head["Domstol"] = head["Domstol"].replace("Hovrätten", "Hovrätten ")
    try:
        # if this throws a KeyError, it's not canonically specified
        self.lookup_resource(head["Domstol"], cutoff=1)
    except KeyError:
        # lookup URI with fuzzy matching, then turn back to canonical label
        head["Domstol"] = self.lookup_label(str(self.lookup_resource(head["Domstol"], warn=False)))
    # 3. Convert head['Målnummer'] to a list. Occasionally more than one
    # Malnummer is provided (c.f. AD 1994 nr 107, AD
    # 2005-117, AD 2003-111) in a comma, semicolon or space
    # separated list. AD 2006-105 even separates with " och ".
    #
    # HOWEVER, "Ö 2475-12" must be converted to ["Ö2475-12"], not
    # ['Ö', '2475-12']
    if head.get("Målnummer"):
        if head["Målnummer"][:2] in ('Ö ', 'B ', 'T '):
            head["Målnummer"] = [head["Målnummer"].replace(" ", "")]
        else:
            res = []
            for v in re.split(r"och|,|;|\s", head['Målnummer']):
                if v.strip():
                    res.append(v.strip())
            head['Målnummer'] = res
    # 4. Create a general term for Målnummer or Domsnummer to act
    # as a local identifier
    if head.get("Målnummer"):
        head["_localid"] = head["Målnummer"]
    elif head.get("Domsnummer"):
        # NB: localid needs to be a list
        head["_localid"] = [head["Domsnummer"]]
    else:
        raise errors.ParseError("Required key (Målnummer/Domsnummer) missing")
    # 5. For NJA, Canonicalize the identifier through a very
    # forgiving regex and split off the alternative identifier
    # as head['_nja_ordinal']
    #
    # "NJA 2008 s 567 (NJA 2008:86)"=>("NJA 2008 s 567", "NJA 2008:86")
    # "NJA 2011 s. 638(NJA2011:57)" => ("NJA 2011 s 638", "NJA 2001:57")
    # "NJA 2012 s. 16(2012:2)" => ("NJA 2012 s 16", "NJA 2012:2")
    if "NJA" in head["Referat"] and " not " not in head["Referat"]:
        m = nja_regex.match(head["Referat"])
        if m:
            head["Referat"] = "NJA %s s %s" % (m.group(1), m.group(2))
            head["_nja_ordinal"] = "NJA %s:%s" % (m.group(3), m.group(4))
        else:
            raise errors.ParseError("Unparseable NJA ref '%s'" % head["Referat"])
    # 6 Canonicalize referats: Fix crap like "AD 2010nr 67",
    # "AD2011 nr 17", "HFD_2012 ref.58", "RH 2012_121", "RH2010
    # :180", "MD:2012:5", "MIG2011:14", "-MÖD 2010:32" and many
    # MANY more
    if " not " not in head["Referat"]:  # notiser always have OK Referat
        m = referat_regex.search(head["Referat"])
        if m:
            if m.group("type") in referat_templ:
                head["Referat"] = referat_templ[m.group("type")] % m.groupdict()
            else:
                head["Referat"] = referat_templ[None] % m.groupdict()
        elif basefile.split("/")[0] in ('ADO', 'MDO'):
            # FIXME: The same logic as under 1, duplicated
            m = basefile_regex.match(basefile)
            head["Referat"] = referat_templ[m.group("type")] % (m.groupdict())
        else:
            raise errors.ParseError("Unparseable ref '%s'" % head["Referat"])
    # 7. Convert Sökord string to an actual list
    if head.get("Sökord"):
        try:
            res = self.sanitize_sokord(head["Sökord"], basefile)
        except KeywordContainsDescription as e:
            res = e.keywords
            rubrik = " / ".join(e.descriptions)
            # I don't really know if this turns out to be a good
            # idea, but let's try it
            if "Rubrik" in head and "_not_" in basefile:
                self.log.debug("%s: Changing rubrik %s -> %s (is that better?)" % (basefile, head["Rubrik"], rubrik))
                head["Rubrik"] = rubrik
        head["Sökord"] = res
    # 8. Convert Avgörandedatum to a sensible value in the face of
    # irregularities like '2010-11 30', '2011 03-23' '2011-
    # 01-27', '2009.08.28' or '07-12-28'
    m = date_regex.match(head["Avgörandedatum"])
    if m:
        if len(m.group(1)) < 4:
            if int(m.group(1)) >= 80:  # '80-01-01' => '1980-01-01',
                year = '19' + m.group(1)
            else:  # '79-01-01' => '2079-01-01',
                year = '20' + m.group(1)
        else:
            year = m.group(1)
        head["Avgörandedatum"] = "%s-%s-%s" % (year, m.group(2), m.group(3))
    else:
        raise errors.ParseError("Unparseable date %s" % head["Avgörandedatum"])
    # 9. Done!
    return head
# Strings that might look like descriptions (ie trip the >= 50 char
# length heuristic in sanitize_sokord's probable_description) but
# actually are legit keywords (albeit very wordy keywords)
sokord_whitelist = ("Rättsprövning enligt lagen (2006:304) om rättsprövning av vissa regeringsbeslut",)
def sanitize_sokord(self, sokordstring, basefile):
    """Split a raw Sökord string into a list of keyword tuples.

    Each element of the returned list is a tuple of
    (main keyword, subkeyword, ...) -- possibly a 1-tuple.

    :param sokordstring: the delimiter-separated Sökord field value
    :param basefile: the basefile id; determines the delimiter and is
                     used in log messages
    :raises KeywordContainsDescription: if any part looks like a
        free-text description rather than a keyword; the exception
        carries both the legit keywords and the descriptions.
    """
    def capitalize(s):
        # remove any non-word char start (like "- ", which
        # sometimes occur due to double-dashes)
        s = re.sub(r"^\W+", "", s)
        return util.ucfirst(s)

    def probable_description(s):
        # FIXME: try to determine if this is sentence-like
        # (eg. containing common verbs like "ansågs", containing
        # more than x words etc)
        return (s not in self.sokord_whitelist) and len(s) >= 50
    res = []
    descs = []
    if basefile.startswith("XXX/"):
        delimiter = ","
    else:
        delimiter = ";"
    for s in sokordstring.split(delimiter):
        s = util.normalize_space(s)
        if not s:
            continue
        # normalize the delimiter between the main keyword and
        # subkeyword for some common variations like "Allmän
        # handling, allmän handling eller inte", "Allmän handling?
        # (brev till biskop i pastoral angelägenhet)" or "Allmän
        # handling -övriga frågor?"
        if " - " not in s:
            s = re.sub(r"(Allmän handling|Allmän försäkring|Arbetsskadeförsäkring|Besvärsrätt|Byggnadsmål|Plan- och bygglagen|Förhandsbesked|Resning)[:,?]?\s+\(?(.*?)\)?$", r"\1 - \2", s)
        subres = []
        substrings = s.split(" - ")
        for idx, subs in enumerate(substrings):
            # often, what should be keywords is more of
            # descriptions, never occurring more than once. Try to
            # identify these pseudo-descriptions (which sometimes
            # are pretty good as descriptions go)
            if not probable_description(subs):
                subres.append(capitalize(subs))
            else:
                if idx + 1 != len(substrings):
                    self.log.warning("%s: Found probable description %r in sökord, but not at last position" % (basefile, subs))
                descs.append(capitalize(subs))
        res.append(tuple(subres))
    if descs:
        # the remainder is not a legit keyword term. However, it
        # might be a useful title. Communicate it back to the
        # caller (but if we have several, omit shorter substrings
        # of longer descs)
        descs.sort(key=len)
        # NB: compare each desc against *all* longer descs; the
        # previous code only checked the immediately following one,
        # which missed substrings of non-adjacent longer descs
        descs = [desc for idx, desc in enumerate(descs)
                 if not any(desc in longer for longer in descs[idx + 1:])]
        raise KeywordContainsDescription(res, descs)
    return res
@cached_property
def rattsfall_parser(self):
    """Lazily constructed citation parser for case-law references
    (Swedish and EU court cases)."""
    legalref = LegalRef(LegalRef.RATTSFALL, LegalRef.EURATTSFALL)
    return SwedishCitationParser(legalref, self.minter, self.commondata)
@cached_property
def lagrum_parser(self):
    """Lazily constructed citation parser for statute references
    (Swedish and EU legislation)."""
    legalref = LegalRef(LegalRef.LAGRUM, LegalRef.EULAGSTIFTNING)
    return SwedishCitationParser(legalref, self.minter, self.commondata)
@cached_property
def litteratur_parser(self):
    """Lazily constructed citation parser for references to
    preparatory works (förarbeten)."""
    legalref = LegalRef(LegalRef.FORARBETEN)
    return SwedishCitationParser(legalref, self.minter, self.commondata)
# create nice RDF from the sanitized metadata
def polish_metadata(self, head):
    """Convert the sanitized metadata dict into RDF.

    Builds two linked descriptions in one graph: one for the
    published referat (report) and one for the underlying court
    verdict (rpubl:VagledandeDomstolsavgorande). Mixed-content
    metadata (Lagrum, Rättsfall, Litteratur) is NOT added to the
    graph; it is stashed in self._bodymeta for later insertion into
    doc.body. Also sets self._canonical_uri to the referat URI.

    :param head: sanitized metadata dict (see sanitize_metadata)
    :returns: the rdflib resource for the referat URI
    """
    def ref_to_uri(ref):
        # mint a URI for a referat string via the case-law citation parser
        nodes = self.rattsfall_parser.parse_string(ref)
        assert isinstance(nodes[0], Link), "Can't make URI from '%s'" % ref
        return nodes[0].uri

    # NOTE(review): split_nja appears unused within this method --
    # possibly vestigial; confirm before removing
    def split_nja(value):
        return [x[:-1] for x in value.split("(")]
    # 1. mint uris and create the two Describers we'll use
    graph = self.make_graph()
    refuri = ref_to_uri(head["Referat"])
    refdesc = Describer(graph, refuri)
    for malnummer in head['_localid']:
        # build a throwaway bnode graph describing the verdict, just
        # so the URI minter can coin a URI for it
        bnodetmp = BNode()
        gtmp = Graph()
        gtmp.bind("rpubl", RPUBL)
        gtmp.bind("dcterms", DCTERMS)
        dtmp = Describer(gtmp, bnodetmp)
        dtmp.rdftype(RPUBL.VagledandeDomstolsavgorande)
        dtmp.value(RPUBL.malnummer, malnummer)
        dtmp.value(RPUBL.avgorandedatum, head['Avgörandedatum'])
        dtmp.rel(DCTERMS.publisher, self.lookup_resource(head["Domstol"]))
        resource = dtmp.graph.resource(bnodetmp)
        domuri = self.minter.space.coin_uri(resource)
    # NB: if there are several målnummer, only the URI coined for the
    # last one is used below (cf. the FIXME under "Målnummer")
    domdesc = Describer(graph, domuri)
    # 2. convert all strings in head to proper RDF (well, some is
    # converted to mixed-content lists and stored in self._bodymeta
    self._bodymeta = defaultdict(list)
    for label, value in head.items():
        if label == "Rubrik":
            value = util.normalize_space(value)
            refdesc.value(RPUBL.referatrubrik, value, lang="sv")
            domdesc.value(DCTERMS.title, value, lang="sv")
        elif label == "Domstol":
            with domdesc.rel(DCTERMS.publisher, self.lookup_resource(value)):
                # NB: Here, we take the court name as provided in
                # the source document as the value for the
                # foaf:name triple. We could also look it up in
                # self.commondata, but since we already have it...
                domdesc.value(FOAF.name, value)
        elif label == "Målnummer":
            for v in value:
                # FIXME: In these cases (multiple målnummer, which
                # primarily occurs with AD), we should really
                # create separate domdesc objects (there are two
                # verdicts, just summarized in one document)
                domdesc.value(RPUBL.malnummer, v)
        elif label == "Domsnummer":
            domdesc.value(RPUBL.domsnummer, value)
        elif label == "Diarienummer":
            domdesc.value(RPUBL.diarienummer, value)
        elif label == "Avdelning":
            domdesc.value(RPUBL.avdelning, value)
        elif label == "Referat":
            # extract structured facts (publication, year, ordinal,
            # page number) out of the canonicalized referat string
            for pred, regex in {'rattsfallspublikation': r'([^ ]+)',
                                'arsutgava': r'(\d{4})',
                                'lopnummer': r'\d{4}(?:\:| nr | not )(\d+)',
                                'sidnummer': r's.? ?(\d+)'}.items():
                m = re.search(regex, value)
                if m:
                    if pred == 'rattsfallspublikation':
                        uri = self.lookup_resource(m.group(1),
                                                   predicate=SKOS.altLabel,
                                                   cutoff=1)
                        refdesc.rel(RPUBL[pred], uri)
                    else:
                        refdesc.value(RPUBL[pred], m.group(1))
            refdesc.value(DCTERMS.identifier, value)
        elif label == "_nja_ordinal":
            # the alternative NJA identifier ("NJA 2008:86"), split
            # off by sanitize_metadata
            refdesc.value(DCTERMS.bibliographicCitation,
                          value)
            m = re.search(r'\d{4}(?:\:| nr | not )(\d+)', value)
            if m:
                refdesc.value(RPUBL.lopnummer, m.group(1))
        elif label == "Avgörandedatum":
            domdesc.value(RPUBL.avgorandedatum, self.parse_iso_date(value))
        # The following metadata (Lagrum, Rättsfall and
        # Litteratur) is handled slightly differently -- it's not
        # added to the RDF graph (doc.meta) that will later be
        # added to the XHTML <head> section. Instead, it's saved
        # in a struct which will later be added to doc.body. This
        # is so that we can represent mixed content metadata
        # (links, linktexts and unlinked text)
        elif label == "Lagrum":
            for i in value:  # better be list not string
                # if (re.search("\d+/\d+/(EU|EG|EEG)", i) or
                #     re.search("\((EU|EG|EEG)\) nr \d+/\d+", i) or
                #     " direktiv" in i or " förordning" in i):
                #     self.log.warning("%s(%s): Lagrum ref to EULaw: '%s'" %
                #                      (head.get("Referat"), head.get("Målnummer"), i))
                self._bodymeta[label].append(self.lagrum_parser.parse_string(i,
                                                                             predicate="rpubl:lagrum"))
        elif label == "Rättsfall":
            for i in value:
                self._bodymeta[label].append(self.rattsfall_parser.parse_string(i,
                                                                                predicate="rpubl:rattsfallshanvisning"))
        elif label == "Litteratur":
            if value:
                for i in value.split(";"):
                    self._bodymeta[label].append(self.litteratur_parser.parse_string(i,
                                                                                     predicate="dcterms:relation"))
        elif label == "Sökord":
            for s in value:
                self.add_keyword_to_metadata(domdesc, s)
    # 3. mint some owl:sameAs URIs -- but only if not using canonical URIs
    # (moved to lagen.nu.DV)
    # 4. Add some same-for-everyone properties
    refdesc.rel(DCTERMS.publisher, self.lookup_resource('Domstolsverket'))
    # NOTE(review): substring test, unlike the " not " (with spaces)
    # used by sanitize_metadata -- confirm the looser match is intended
    if 'not' in head['Referat']:
        refdesc.rdftype(RPUBL.Rattsfallsnotis)
    else:
        refdesc.rdftype(RPUBL.Rattsfallsreferat)
    domdesc.rdftype(RPUBL.VagledandeDomstolsavgorande)
    refdesc.rel(RPUBL.referatAvDomstolsavgorande, domuri)
    refdesc.value(PROV.wasGeneratedBy, self.qualified_class_name())
    self._canonical_uri = refuri
    return refdesc.graph.resource(refuri)
def add_keyword_to_metadata(self, domdesc, keyword):
    """Add one keyword as a dcterms:subject on the verdict description.

    :param domdesc: the Describer for the verdict (dom) resource
    :param keyword: a tuple of sub-keywords (possibly a 1-tuple), as
                    produced by sanitize_sokord
    """
    # Canonical uris don't define a URI space for
    # keywords/concepts. Instead refer to bnodes
    # with rdfs:label set (cf. rdl/documentation/exempel/
    # documents/publ/Domslut/HD/2009/T_170-08.rdf).
    #
    # Subclasses that has an idea of how to create a URI for a
    # keyword/concept might override this.
    #
    # NB: the message is now actually interpolated -- the original
    # left a literal "%s" in the assert text
    assert isinstance(keyword, tuple), \
        "Keyword %s should have been a tuple of sub-keywords (possible 1-tuple)" % (keyword,)
    with domdesc.rel(DCTERMS.subject):
        # if subkeywords, create a label like "Allmän handling»Begärd handling saknades"
        domdesc.value(RDFS.label, "»".join(keyword), lang=self.lang)
def postprocess_doc(self, doc):
    """Post-parse housekeeping for a parsed verdict document.

    Maintains the basefile->URI map file (uri.map), verifies that
    doc.uri round-trips back to doc.basefile, prunes empty Instans
    elements, and prepends the mixed-content metadata collected in
    self._bodymeta (by polish_metadata) to doc.body.

    :param doc: the parsed Document
    :raises DuplicateReferatDoc: if a different basefile already maps
                                 to the same URI
    """
    if self.config.mapfiletype == "nginx":
        path = urlparse(doc.uri).path
    else:
        idx = len(self.urispace_base) + len(self.urispace_segment) + 2
        path = doc.uri[idx:]

    def map_append_needed(mapped_path, filename):
        if mapped_path == path:
            # This means that a previously parsed basefile
            # already maps to the same URI (eg because a referat
            # of multiple dom documents occur as several different
            # (identical) basefiles. If it's a different basefile
            # (and not just the same parsed twice), raise
            # DuplicateReferatDoc
            try:
                # convert generated path to a basefile if possible
                basefile = filename.split("/", 3)[-1].split(".")[0]
            except Exception:  # narrowed from a bare except
                basefile = filename
            if doc.basefile != basefile:
                raise DuplicateReferatDoc(basefile, dummyfile=self.store.parsed_path(doc.basefile))
            return False
    # the result of readmapfile can be either False (the case in
    # question already appeared in the uri.map file(s)), or None
    # (the case was not found anywhere -- we should
    # append it to the appropriate uri.map file)
    append_needed = self.readmapfile(map_append_needed)
    if append_needed is not False:
        if self.config.clientname:
            # in a distributed setting, use a
            # uri-<clientname>-<pid>.map, eg
            # "uri-sophie-4435.map", to avoid corruption of a
            # single file by multiple writers, or slowness due to
            # lock contention.
            mapfile = self.store.path("uri", "generated", ".%s.%s.map" % (self.config.clientname, os.getpid()))
        else:
            mapfile = self.store.path("uri", "generated", ".map")
        with codecs.open(mapfile, "a", encoding="utf-8") as fp:
            if self.config.mapfiletype == "nginx":
                fp.write("%s\t/dv/generated/%s.html;\n" % (path,
                                                           doc.basefile))
            else:
                fp.write("%s\t%s\n" % (path, doc.basefile))
        # invalidate any cached basefile map
        if hasattr(self, "_basefilemap"):
            delattr(self, "_basefilemap")
    # NB: This cannot be made to work 100% as there is not a 1:1
    # mapping between basefiles and URIs since multiple basefiles
    # might map to the same URI (those basefiles should be
    # equivalent though). Examples:
    # HDO/B883-81_1 -> https://lagen.nu/rf/nja/1982s350 -> HDO/B882-81_1
    # HFD/1112-14 -> https://lagen.nu/rf/hfd/2014:35 -> HFD/1113-14
    #
    # However, we detect that above and throw a
    # DuplicateReferatDoc error for the second (or third, or
    # fourth...) basefile encountered.
    computed_basefile = self.basefile_from_uri(doc.uri)
    assert doc.basefile == computed_basefile, "%s -> %s -> %s" % (doc.basefile, doc.uri, computed_basefile)
    # remove empty Instans objects (these can happen when both a
    # separate heading, as well as a clue in a paragraph,
    # indicates a new court).
    roots = []
    for node in doc.body:
        if isinstance(node, Delmal):
            roots.append(node)
    if not roots:
        roots.append(doc.body)
    for root in roots:
        for node in list(root):
            if isinstance(node, Instans) and len(node) == 0:
                # print("Removing Instans %r" % node.court)
                root.remove(node)
    # add information from _bodymeta to doc.body
    bodymeta = Div(**{'class': 'bodymeta',
                      'about': str(doc.meta.value(URIRef(doc.uri), RPUBL.referatAvDomstolsavgorande))})
    for k, v in sorted(self._bodymeta.items()):
        d = Div(**{'class': k})
        for i in v:
            d.append(P(i))
        bodymeta.append(d)
    doc.body.insert(0, bodymeta)
def infer_identifier(self, basefile):
    """Return the dcterms:identifier for basefile, read from its
    previously distilled RDF metadata.

    :raises ValueError: if no distilled file exists for basefile
    """
    distilled = self.store.distilled_path(basefile)
    if not os.path.exists(distilled):
        raise ValueError("No distilled file for basefile %s at %s" % (basefile, distilled))
    with self.store.open_distilled(basefile) as fp:
        graph = Graph().parse(data=fp.read())
    subject = URIRef(self.canonical_uri(basefile))
    return str(graph.value(subject, DCTERMS.identifier))
def parse_body_parseconfigs(self):
    """Return the parse configurations to try for the document body,
    in order of preference."""
    configs = ("default", "simple")
    return configs
# @staticmethod
def get_parser(self, basefile, sanitized, parseconfig="default"):
re_courtname = re.compile(
"^(Högsta domstolen|Hovrätten (över|för)[A-ZÅÄÖa-zåäö ]+|([A-ZÅÄÖ][a-zåäö]+ )(tingsrätt|hovrätt))(|, mark- och miljödomstolen|, Mark- och miljööverdomstolen)$")
# productions = {'karande': '..',
# 'court': '..',
# 'date': '..'}
# at parse time, initialize matchers
rx = (
{'name': 'fr-överkl',
're': '(?P<karanden>[\w\.\(\)\- ]+) överklagade (beslutet|domen) '
'till (?P<court>(Förvaltningsrätten|Länsrätten|Kammarrätten) i \w+(| län)'
'(|, migrationsdomstolen|, Migrationsöverdomstolen)|'
'Högsta förvaltningsdomstolen)( \((?P<date>\d+-\d+-\d+), '
'(?P<constitution>[\w\.\- ,]+)\)|$)',
'method': 'match',
'type': ('instans',),
'court': ('REG', 'HFD', 'MIG')},
{'name': 'fr-dom',
're': '(?P<court>(Förvaltningsrätten|'
'Länsrätten|Kammarrätten) i \w+(| län)'
'(|, migrationsdomstolen|, Migrationsöverdomstolen)|'
'Högsta förvaltningsdomstolen) \((?P<date>\d+-\d+-\d+), '
'(?P<constitution>[\w\.\- ,]+)\)',
'method': 'match',
'type': ('dom',),
'court': ('REG', 'HFD', 'MIG')},
{'name': 'tr-dom',
're': '(?P<court>TR:n|Tingsrätten|HovR:n|Hovrätten|Mark- och miljödomstolen) \((?P<constitution>[\w\.\- ,]+)\) (anförde|fastställde|stadfäste|meddelade) (följande i |i beslut i |i |)(dom|beslut) (d\.|d|den) (?P<date>\d+ \w+\.? \d+)',
'method': 'match',
'type': ('dom',),
'court': ('HDO', 'HGO', 'HNN', 'HON', 'HSB', 'HSV', 'HVS')},
{'name': 'hd-dom',
're': 'Målet avgjordes efter huvudförhandling (av|i) (?P<court>HD) \((?P<constitution>[\w:\.\- ,]+)\),? som',
'method': 'match',
'type': ('dom',),
'court': ('HDO',)},
{'name': 'hd-dom2',
're': '(?P<court>HD) \((?P<constitution>[\w:\.\- ,]+)\) meddelade den (?P<date>\d+ \w+ \d+) följande',
'method': 'match',
'type': ('dom',),
'court': ('HDO',)},
{'name': 'hd-fastst',
're': '(?P<court>HD) \((?P<constitution>[\w:\.\- ,]+)\) (beslöt|fattade (slutligt|följande slutliga) beslut)',
'method': 'match',
'type': ('dom',),
'court': ('HDO',)},
{'name': 'mig-dom',
're': '(?P<court>Kammarrätten i Stockholm, Migrationsöverdomstolen) \((?P<date>\d+-\d+-\d+), (?P<constitution>[\w\.\- ,]+)\)',
'method': 'match',
'type': ('dom',),
'court': ('MIG',)},
{'name': 'miv-forstainstans',
're': '(?P<court>Migrationsverket) avslog (ansökan|ansökningarna) den (?P<date>\d+ \w+ \d+) och beslutade att',
'method': 'match',
'type': ('dom',),
'court': ('MIG',)},
{'name': 'miv-forstainstans-2',
're': '(?P<court>Migrationsverket) avslog den (?P<date>\d+ \w+ \d+) A:s ansökan och beslutade att',
'method': 'match',
'type': ('dom',),
'court': ('MIG',)},
{'name': 'mig-dom-alt',
're': 'I sin dom avslog (?P<court>Förvaltningsrätten i Stockholm, migrationsdomstolen) \((?P<date>\d+- ?\d+-\d+), (?P<constitution>[\w\.\- ,]+)\)',
'method': 'match',
'type': ('dom',),
'court': ('MIG',)},
{'name': 'allm-åkl',
're': 'Allmän åklagare yrkade (.*)vid (?P<court>(([A-ZÅÄÖ]'
'[a-zåäö]+ )+)(TR|tingsrätt))',
'method': 'match',
'type': ('instans',),
'court': ('HDO', 'HGO', 'HNN', 'HON', 'HSB', 'HSV', 'HVS')},
{'name': 'stämning',
're': 'stämning å (?P<svarande>.*) vid (?P<court>(([A-ZÅÄÖ]'
'[a-zåäö]+ )+)(TR|tingsrätt))',
'method': 'search',
'type': ('instans',),
'court': ('HDO', 'HGO', 'HNN', 'HON', 'HSB', 'HSV', 'HVS')},
{'name': 'ansökan',
're': 'ansökte vid (?P<court>(([A-ZÅÄÖ][a-zåäö]+ )+)'
'(TR|tingsrätt)) om ',
'method': 'search',
'type': ('instans',),
'court': ('HDO', 'HGO', 'HNN', 'HON', 'HSB', 'HSV', 'HVS')},
{'name': 'riksåkl',
're': 'Riksåklagaren väckte i (?P<court>HD|HovR:n (över|för) '
'([A-ZÅÄÖ][a-zåäö]+ )+|[A-ZÅÄÖ][a-zåäö]+ HovR) åtal',
'method': 'match',
'type': ('instans',),
'court': ('HDO', 'HGO', 'HNN', 'HON', 'HSB', 'HSV', 'HVS')},
{'name': 'tr-överkl',
're': '(?P<karande>[\w\.\(\)\- ]+) (fullföljde talan|'
'överklagade) (|TR:ns dom.*)i (?P<court>HD|(HovR:n|hovrätten) '
'(över|för) (Skåne och Blekinge|Västra Sverige|Nedre '
'Norrland|Övre Norrland)|(Svea|Göta) (HovR|hovrätt))',
'method': 'match',
'type': ('instans',),
'court': ('HDO', 'HGO', 'HNN', 'HON', 'HSB', 'HSV', 'HVS')},
{'name': 'fullfölj-överkl',
're': '(?P<karanden>[\w\.\(\)\- ]+) fullföljde sin talan$',
'method': 'match',
'type': ('instans',)},
{'name': 'myndighetsansökan',
're': 'I (ansökan|en ansökan|besvär) hos (?P<court>\w+) '
'(om förhandsbesked|yrkade)',
'method': 'match',
'type': ('instans',),
'court': ('REG', 'HFD')},
{'name': 'myndighetsbeslut',
're': '(?P<court>\w+) beslutade (därefter |)(den (?P<date>\d+ \w+ \d+)|'
'[\w ]+) att',
'method': 'match',
'type': ('instans',),
'court': ('REG', 'HFD', 'MIG')},
{'name': 'myndighetsbeslut2',
're': '(?P<court>[\w ]+) (bedömde|vägrade) i (bistånds|)beslut'
' (|den (?P<date>\d+ \w+ \d+))',
'method': 'match',
'type': ('instans',),
'court': ('REG', 'HFD')},
{'name': 'hd-revision',
're': '(?P<karanden>[\w\.\(\)\- ]+) sökte revision och yrkade(,'
'i första hand,|, såsom hans talan fick förstås,|,|) att (?P<court>HD|)',
'method': 'match',
'type': ('instans',),
'court': ('HDO',)},
{'name': 'hd-revision2',
're': '(?P<karanden>[\w\.\(\)\- ]+) sökte revision$',
'method': 'match',
'type': ('instans',),
'court': 'HDO'},
{'name': 'hd-revision3',
're': '(?P<karanden>[\w\.\(\)\- ]+) sökte revision och framställde samma yrkanden',
'method': 'match',
'type': ('instans',),
'court': 'HDO'},
{'name': 'överklag-bifall',
're': '(?P<karanden>[\w\.\(\)\- ]+) (anförde besvär|'
'överklagade) och yrkade bifall till (sin talan i '
'(?P<prevcourt>HovR:n|TR:n)|)',
'method': 'match',
'type': ('instans',),
'court': ('HDO', 'HGO', 'HNN', 'HON', 'HSB', 'HSV', 'HVS')},
{'name': 'överklag-2',
're': '(?P<karanden>[\w\.\(\)\- ]+) överklagade '
'(för egen del |)och yrkade (i själva saken |)att '
'(?P<court>HD|HovR:n|kammarrätten|Regeringsrätten|)',
'method': 'match',
'type': ('instans',)},
{'name': 'överklag-3',
're': '(?P<karanden>[\w\.\(\)\- ]+) överklagade (?P<prevcourt>'
'\w+)s (beslut|omprövningsbeslut|dom)( i ersättningsfrågan|) (hos|till) '
'(?P<court>[\w\, ]+?)( och yrkade| och anförde|, som| \(Sverige\)|$)',
'method': 'match',
'type': ('instans',)},
{'name': 'överklag-4',
're': '(?!Även )(?P<karanden>(?!HD fastställer)[\w\.\(\)\- ]+) överklagade ((?P<prevcourt>\w+)s (beslut|dom)|beslutet|domen)( och|$)',
'method': 'match',
'type': ('instans',)},
{'name': 'hd-ansokan',
're': '(?P<karanden>[\w\.\(\)\- ]+) anhöll i ansökan som inkom '
'till (?P<court>HD) d \d+ \w+ \d+',
'method': 'match',
'type': ('instans',),
'court': ('HDO',)},
{'name': 'hd-skrivelse',
're': '(?P<karanden>[\w\.\(\)\- ]+) anförde i en till '
'(?P<court>HD) den \d+ \w+ \d+ ställd',
'method': 'match',
'type': ('instans',),
'court': ('HDO',)},
{'name': 'överklag-5',
're': '(?!Även )(?P<karanden>[\w\.\(\)\- ]+?) överklagade '
'(?P<prevcourt>\w+)s (dom|domar)',
'method': 'match',
'type': ('instans',)},
{'name': 'överklag-6',
're': '(?P<karanden>[\w\.\(\)\- ]+) överklagade domen till '
'(?P<court>\w+)($| och yrkade)',
'method': 'match',
'type': ('instans',)},
{'name': 'myndighetsbeslut3',
're': 'I sitt beslut den (?P<date>\d+ \w+ \d+) avslog '
'(?P<court>\w+)',
'method': 'match',
'type': ('instans',),
'court': ('REG', 'HFD', 'MIG')},
{'name': 'domskal',
're': "(Skäl|Domskäl|HovR:ns domskäl|Hovrättens domskäl)(\. |$)",
'method': 'match',
'type': ('domskal',)},
{'name': 'domskal-ref',
're': "(Tingsrätten|TR[:\.]n|Hovrätten|HD|Högsta förvaltningsdomstolen) \([^)]*\) (meddelade|anförde|fastställde|yttrade)",
'method': 'match',
'type': ('domskal',)},
{'name': 'domskal-dom-fr', # a simplified copy of fr-överkl
're': '(?P<court>(Förvaltningsrätten|'
'Länsrätten|Kammarrätten) i \w+(| län)'
'(|, migrationsdomstolen|, Migrationsöverdomstolen)|'
'Högsta förvaltningsdomstolen) \((?P<date>\d+-\d+-\d+), '
'(?P<constitution>[\w\.\- ,]+)\),? yttrade',
'method': 'match',
'type': ('domskal',)},
{'name': 'domslut-standalone',
're': '(Domslut|(?P<court>Hovrätten|HD|hd|Högsta förvaltningsdomstolen):?s avgörande)$',
'method': 'match',
'type': ('domslut',)},
{'name': 'domslut-start',
're': '(?P<court>[\w ]+(domstolen|rätten))s avgörande$',
'method': 'match',
'type': ('domslut',)}
)
court = basefile.split("/")[0]
matchers = defaultdict(list)
matchersname = defaultdict(list)
for pat in rx:
if 'court' not in pat or court in pat['court']:
for t in pat['type']:
# print("Adding pattern %s to %s" % (pat['name'], t))
matchers[t].append(
getattr(
re.compile(
pat['re'],
re.UNICODE),
pat['method']))
matchersname[t].append(pat['name'])
def is_delmal(parser, chunk=None):
# should handle "IV", "I (UM1001-08)" and "I." etc
# but not "I DEFINITION" or "Inledning"...
if chunk:
strchunk = str(chunk)
else:
strchunk = str(parser.reader.peek()).strip()
if len(strchunk) < 20:
m = re.match("(I{1,3}|IV)\.? ?(|\(\w+\-\d+\))$", strchunk)
if m:
res = {'id': m.group(1)}
if m.group(2):
res['malnr'] = m.group(2)[1:-1]
return res
return {}
def is_instans(parser, chunk=None):
"""Determines whether the current position starts a new instans part
of the report.
"""
chunk = parser.reader.peek()
strchunk = str(chunk)
res = analyze_instans(strchunk)
# sometimes, HD domskäl is written in a way that mirrors
# the referat of the lower instance (eg. "1. Optimum
# ansökte vid Lunds tingsrätt om stämning mot..."). If the
# instans progression goes from higher->lower court,
# something is amiss.
if (hasattr(parser, 'current_instans') and
parser.current_instans.court == "Högsta domstolen" and
isinstance(res.get('court'), str) and "tingsrätt" in res['court']):
return False
if res:
# in some referats, two subsequent chunks both matches
# analyze_instans, even though they refer to the _same_
# instans. Check to see if that is the case
if (hasattr(parser, 'current_instans') and
hasattr(parser.current_instans, 'court') and
parser.current_instans.court and
is_equivalent_court(res['court'],
parser.current_instans.court)):
return {}
else:
return res
elif parser._state_stack == ['body']:
# if we're at root level, *anything* starts a new instans
return True
else:
return {}
def is_equivalent_court(newcourt, oldcourt):
# should handle a bunch of cases
# >>> is_equivalent_court("Göta Hovrätt", "HovR:n")
# True
# >>> is_equivalent_court("HD", "Högsta domstolen")
# True
# >>> is_equivalent_court("Linköpings tingsrätt", "HovR:n")
# False
# >>> is_equivalent_court(True, "Högsta domstolen")
# True
# if newcourt is True:
# return newcourt
newcourt = canonicalize_court(newcourt)
oldcourt = canonicalize_court(oldcourt)
if newcourt is True and str(oldcourt) in ('Högsta domstolen'):
# typically an effect of both parties appealing to the
# supreme court
return True
if newcourt == oldcourt:
return True
else:
return False
def canonicalize_court(courtname):
if isinstance(courtname, bool):
return courtname # we have no idea which court this
# is, only that it is A court
else:
return courtname.replace(
"HD", "Högsta domstolen").replace("HovR", "Hovrätt")
def is_heading(parser):
chunk = parser.reader.peek()
strchunk = str(chunk)
if not strchunk.strip():
return False
# a heading is reasonably short and does not end with a
# period (or other sentence ending typography)
return len(strchunk) < 140 and not (strchunk.endswith(".") or
strchunk.endswith(":") or
strchunk.startswith("”"))
def is_betankande(parser):
strchunk = str(parser.reader.peek())
return strchunk in ("Målet avgjordes efter föredragning.",
"HD avgjorde målet efter föredragning.")
def is_dom(parser):
strchunk = str(parser.reader.peek())
res = analyze_dom(strchunk)
return res
def is_domskal(parser):
strchunk = str(parser.reader.peek())
res = analyze_domskal(strchunk)
return res
def is_domslut(parser):
strchunk = str(parser.reader.peek())
return analyze_domslut(strchunk)
def is_skiljaktig(parser):
strchunk = str(parser.reader.peek())
return re.match(
"(Justitie|Kammarrätts)råde[nt] ([^\.]*) var (skiljaktig|av skiljaktig mening)", strchunk)
def is_tillagg(parser):
    """Recognizer: does the next chunk open a tillägg (a justice's
    personal addendum)? Returns the match object (or None)."""
    strchunk = str(parser.reader.peek())
    # FIX: raw string -- the original non-raw literal contained the
    # invalid escape sequence "\." (DeprecationWarning on py3.6+)
    return re.match(
        r"Justitieråde[nt] ([^\.]*) (tillade för egen del|gjorde för egen del ett tillägg)", strchunk)
def is_endmeta(parser):
    """Recognizer: does the next chunk start the trailing metadata
    section ("HD:s dom meddelad: den ...")? Returns the match object."""
    chunk = str(parser.reader.peek())
    return re.match("HD:s (beslut|dom|domar) meddela(de|d|t): den", chunk)
def is_paragraph(parser):
    """Fallback recognizer: any chunk whatsoever can be a paragraph."""
    return True
# Turns out, sentence splitting is really difficult if you consider
# abbreviations. This particular heuristic splits on periods only
# (sentences ending with ? or ! are rare in legal text) and only if
# followed by a capital letter (ie the next sentence) or EOF. Does not
# handle things like "Mr. Smith" but that's also rare in swedish
# text. However, it needs to handle "Linder, Erliksson, referent, och
# C. Bohlin", so another heuristic is that the preceding sentence can't
# end in a single capital letter (hence the lookbehind).
def split_sentences(text):
    """Heuristically split text into a list of sentence strings."""
    text = util.normalize_space(text)
    # a trailing space lets the end-of-text alternative of the
    # lookahead ($) kick in for the final sentence
    text += " "
    # FIX: raw string -- the original non-raw literal contained the
    # invalid escape sequence "\." (DeprecationWarning on py3.6+)
    return [x.strip() for x in
            re.split(r"(?<![A-ZÅÄÖ])\. (?=[A-ZÅÄÖ]|$)", text)]
def analyze_instans(strchunk):
    """Determine whether strchunk opens a new court instance (instans).

    Returns a dict which is empty (falsy) if the chunk does not look
    like the start of an instans, and may otherwise contain:

    * ``court``: the court name, or True when some unidentified court
      is referenced
    * ``date``: the decision date, when one could be parsed
    * ``complete``: True when the whole chunk is a fixed court-name
      heading (and thus contains no other content)
    """
    res = {}
    # Case 1: Fixed headings indicating new instance
    if re_courtname.match(strchunk):
        res['court'] = strchunk
        res['complete'] = True
        return res
    else:
        # case 2: common wording patterns indicating new
        # instance
        # "H.T. sökte revision och yrkade att HD måtte fastställa" =>
        # <Instans name="HD"><str>H.T. sökte revision och yrkade att <PredicateSubject rel="HD" uri="http://lagen.nu/org/2008/hogsta-domstolen/">HD>/PredicateSubject>
        # <div class="instans" rel="dc:creator" href="..."
        # the needed sentence is usually 1st or 2nd
        # (occassionally 3rd), searching more yields risk of
        # false positives.
        sentences = split_sentences(strchunk)[:3]
        # In rare cases (HDO/T50-91_1) a chunk might be a
        # Domskäl, but the second sentence looks like the
        # start of an Instans. Since recognizers come in a
        # particular order, is_instans is run before
        # is_domskal, we must detect this false positive
        domskal_match = matchers['domskal'][matchersname['domskal'].index('domskal')]
        if domskal_match(sentences[0]):
            return res
        for sentence in sentences:
            for (r, rname) in zip(matchers['instans'], matchersname['instans']):
                m = r(sentence)
                if m:
                    # print("analyze_instans: Matcher '%s' succeeded on '%s'" % (rname, sentence))
                    mg = m.groupdict()
                    if 'court' in mg and mg['court']:
                        res['court'] = mg['court'].strip()
                    else:
                        # matcher hit but no named court group: we only
                        # know that *some* court is referenced
                        res['court'] = True
                    # if 'prevcourt' in mg and mg['prevcourt']:
                    #    res['prevcourt'] = mg['prevcourt'].strip()
                    if 'date' in mg and mg['date']:
                        # try swedish date format first, fall back
                        # to ISO format
                        parse_swed = DV().parse_swedish_date
                        parse_iso = DV().parse_iso_date
                        try:
                            res['date'] = parse_swed(mg['date'])
                        except ValueError:
                            res['date'] = parse_iso(mg['date'])
                    # first matching sentence wins
                    return res
    return res
def analyze_dom(strchunk):
    """Determine whether strchunk opens the verdict (dom) of an instans.

    Returns a dict which is empty (falsy) if not, and may otherwise
    contain 'court' (a name, or True) and 'date' (the avgörandedatum).
    """
    res = {}
    # special case for "referat" who are nothing but straight verdict documents.
    if strchunk.strip() == "SAKEN":
        return {'court': True}
    # probably only the 1st sentence is interesting
    for sentence in split_sentences(strchunk)[:1]:
        for (r, rname) in zip(matchers['dom'], matchersname['dom']):
            m = r(sentence)
            if m:
                # print("analyze_dom: Matcher '%s' succeeded on '%s': %r" % (rname, sentence,m.groupdict()))
                mg = m.groupdict()
                if 'court' in mg and mg['court']:
                    res['court'] = mg['court'].strip()
                if 'date' in mg and mg['date']:
                    # swedish date format first, then ISO; a date that
                    # parses as neither is silently dropped
                    parse_swed = DV().parse_swedish_date
                    parse_iso = DV().parse_iso_date
                    try:
                        res['date'] = parse_swed(mg['date'])
                    except ValueError:
                        try:
                            res['date'] = parse_iso(mg['date'])
                        except ValueError:
                            pass
                # or res['date'] = mg['date']??
                # if 'constitution' in mg:
                #     res['constitution'] = parse_constitution(mg['constitution'])
                return res
    return res
def analyze_domskal(strchunk):
    """Return {'domskal': True} if the first sentence of strchunk
    matches any domskäl recognizer, else an empty (falsy) dict."""
    # only the 1st sentence is examined
    for sentence in split_sentences(strchunk)[:1]:
        for matcher, matchername in zip(matchers['domskal'],
                                        matchersname['domskal']):
            if matcher(sentence):
                # print("analyze_domskal: Matcher '%s' succeeded on '%s'" % (matchername, sentence))
                return {'domskal': True}
    return {}
def analyze_domslut(strchunk):
    """Return {'court': name-or-True} if the first sentence of strchunk
    matches any domslut recognizer, else an empty (falsy) dict."""
    # only the 1st sentence is examined
    for sentence in split_sentences(strchunk)[:1]:
        for matcher, matchername in zip(matchers['domslut'],
                                        matchersname['domslut']):
            m = matcher(sentence)
            if not m:
                continue
            # print("analyze_domslut: Matcher '%s' succeeded on '%s'" % (matchername, sentence))
            court = m.groupdict().get('court')
            return {'court': court.strip() if court else True}
    return {}
def parse_constitution(strchunk):
    """Parse a comma-separated list of judges ("rättens sammansättning")
    into a list of dicts with key ``name`` and, where detected,
    ``position`` ("ordförande"/"ordf"/"referent") and ``title``.

    NOTE(review): appears unused at present -- the only call site (in
    analyze_dom) is commented out.
    """
    res = []
    for thing in strchunk.split(", "):
        if thing in ("ordförande", "referent"):
            # a bare role word modifies the previously listed judge
            res[-1]['position'] = thing
        elif thing.startswith("ordförande ") or thing.startswith("ordf "):
            pos, name = thing.split(" ", 1)
            if name.startswith("t f lagmannen"):
                title, name = name[:13], name[14:]
            elif name.startswith("hovrättsrådet"):
                title, name = name[:13], name[14:]
            else:
                title = None
            r = {'name': name,
                 'position': pos,
                 'title': title}
            # FIX: the original tested ``'title' not in r``, which is
            # never true since r is always created with a 'title' key;
            # the intent (cf the old "also filter nulls" comment) was
            # clearly to drop the key when no title was found.
            if r['title'] is None:
                del r['title']
            res.append(r)
        else:
            res.append({'name': thing})
    return res
# FIXME: This and make_paragraph ought to be expressed as
# generic functions in the ferenda.fsmparser module
@newstate('body')
def make_body(parser):
    """Root constructor: build the document Body from all chunks."""
    body = Body()
    return parser.make_children(body)
@newstate('delmal')
def make_delmal(parser):
    """Construct a Delmal (sub-case) node and collect its children."""
    attrs = is_delmal(parser, parser.reader.next())
    # starting a new delmål invalidates any instans tracked so far
    if hasattr(parser, 'current_instans'):
        del parser.current_instans
    node = Delmal(ordinal=attrs['id'], malnr=attrs.get('malnr'))
    return parser.make_children(node)
@newstate('instans')
def make_instans(parser):
    """Construct an Instans node from the current chunk and collect its
    children. The court attribute is taken from the chunk analysis
    (falling back to parser.defaultcourt) and may be upgraded afterwards
    if a child Dom node carries a longer (better) court name."""
    chunk = parser.reader.next()
    strchunk = str(chunk)
    idata = analyze_instans(strchunk)
    # idata may be {} if the special toplevel rule in is_instans applied
    if 'complete' in idata:
        # the chunk is a pure court-name heading; use it as the court
        # attribute but not as content
        i = Instans(court=strchunk)
        court = strchunk
    elif 'court' in idata and idata['court'] is not True:
        i = Instans([chunk], court=idata['court'])
        court = idata['court']
    else:
        i = Instans([chunk], court=parser.defaultcourt)
        court = parser.defaultcourt
    if court is None:
        court = ""  # we might need to calculate the courts len() below
    # FIXME: ugly hack, but is_instans needs access to this
    # object...
    parser.current_instans = i
    res = parser.make_children(i)
    # might need to adjust the court parameter based on better
    # information in the parse tree
    for child in res:
        if isinstance(child, Dom) and hasattr(child, 'court'):
            # longer courtnames are better
            if len(str(child.court)) > len(court):
                i.court = child.court
    return res
def make_heading(parser):
    """Construct a Heading from the next chunk; a heading is by
    definition a single line."""
    return Heading(parser.reader.next())
@newstate('betankande')
def make_betankande(parser):
    """Construct a Betankande node seeded with the current chunk, then
    collect its children."""
    node = Betankande()
    node.append(parser.reader.next())
    return parser.make_children(node)
@newstate('dom')
def make_dom(parser):
    """Construct a Dom (verdict) node with date/court from the chunk
    analysis, then collect its children."""
    # peek() rather than next(): is_domskal must get a chance at the
    # very same chunk
    ddata = analyze_dom(str(parser.reader.peek()))
    node = Dom(avgorandedatum=ddata.get('date'),
               court=ddata.get('court'),
               malnr=ddata.get('caseid'))
    return parser.make_children(node)
@newstate('domskal')
def make_domskal(parser):
    """Construct an (initially empty) Domskal node and collect its
    children."""
    return parser.make_children(Domskal())
@newstate('domslut')
def make_domslut(parser):
    """Construct an (initially empty) Domslut node and collect its
    children."""
    return parser.make_children(Domslut())
@newstate('skiljaktig')
def make_skiljaktig(parser):
    """Construct a Skiljaktig (dissenting opinion) node seeded with the
    current chunk, then collect its children."""
    node = Skiljaktig()
    node.append(parser.reader.next())
    return parser.make_children(node)
@newstate('tillagg')
def make_tillagg(parser):
    """Construct a Tillagg (personal addendum) node seeded with the
    current chunk, then collect its children."""
    node = Tillagg()
    node.append(parser.reader.next())
    return parser.make_children(node)
@newstate('endmeta')
def make_endmeta(parser):
    """Construct an Endmeta (trailing metadata) node seeded with the
    current chunk, then collect its children."""
    node = Endmeta()
    node.append(parser.reader.next())
    return parser.make_children(node)
def make_paragraph(parser):
    """Construct a Paragraph -- or, for numbered paragraphs in NJA/MD
    referats, an OrderedParagraph -- from the next chunk. Returns None
    for all-whitespace chunks so that they are dropped."""
    chunk = parser.reader.next()
    strchunk = str(chunk)
    if not strchunk.strip():  # filter out empty things
        return None
    # FIX: evaluate ordered() once per chunk (the original called it
    # twice in each branch) and use raw strings for the regexes (the
    # originals contained invalid escapes like "\s"/"\d", which raise
    # DeprecationWarning on py3.6+)
    ordinal = ordered(strchunk) if parser.has_ordered_paras else None
    if ordinal:
        # FIXME: Cut the ordinal from chunk somehow
        if isinstance(chunk, Paragraph):
            chunks = list(chunk)
            chunks[0] = re.sub(r"^\s*\d+\. ", "", chunks[0])
            p = OrderedParagraph(chunks, ordinal=ordinal)
        else:
            chunk = re.sub(r"^\s*\d+\. ", "", chunk)
            p = OrderedParagraph([chunk], ordinal=ordinal)
    else:
        if isinstance(chunk, Paragraph):
            p = chunk
        else:
            p = Paragraph([chunk])
    return p
def ordered(chunk):
    """Given a string that might be an ordered paragraph, return the
    ordinal (as a string) if so, or None otherwise.
    """
    # most ordered paras use "18. Blahonga". But when quoting
    # EU law, sometimes "18 Blahonga". Treat these the same.
    # NOTE: It should not match eg "24hPoker är en
    # bolagskonstruktion..." (HDO/B2760-09)
    # FIX: raw string -- the original non-raw literal contained the
    # invalid escape sequences "\." and "\s" (DeprecationWarning on
    # py3.6+); also fixed the "otherwise.x" docstring typo.
    m = re.match(r"(\d+)\.?\s", chunk)
    if m:
        return m.group(1)
def transition_domskal(symbol, statestack):
    """Transition callback for ("betankande", is_domskal): construct a
    Domskal while a betänkande is on the state stack, otherwise pop."""
    if 'betankande' in statestack:
        # NOTE: an earlier version mangled the statestack here (setting
        # the 'betankande' slot to "__done__") so that the *next*
        # is_domskal hit would pop instead of push. That hack made
        # TestDV.test_parse_HDO_O2668_07 fail since is_dom wasn't among
        # the possible recognizers when "HD (...) fattade slutligt
        # beslut i enlighet [...]" was up, and removing it causes no
        # test failures, so it is gone.
        return make_domskal, "domskal"
    # not inside a betänkande: here's where we pop the stack
    return False, None
# Assemble the FSM parser: recognizers, transition table and shared
# configuration, then return its parse entry point.
p = FSMParser()
# "dom" should not really be a commonstate (it should
# theoretically alwawys be followed by domskal or maybe
# domslut) but in some cases, the domskal merges with the
# start of dom in such a way that we can't transition into
# domskal right away (eg HovR:s dom in HDO/B10-86_1 and prob
# countless others)
commonstates = (
    "body",
    "delmal",
    "instans",
    "dom",
    "domskal",
    "domslut",
    "betankande",
    "skiljaktig",
    "tillagg")
if parseconfig == "simple":
    # "simple" config: no structural analysis, just a flat stream of
    # paragraphs directly under body
    p.set_recognizers(is_paragraph)
    p.set_transitions({
        ("body", is_paragraph): (make_paragraph, None)
    })
    p.has_ordered_paras = False
else:
    # full config: recognizers are tried in this order for each chunk;
    # the specific ones must come before is_heading/is_paragraph, which
    # match almost anything
    p.set_recognizers(is_delmal,
                      is_endmeta,
                      is_instans,
                      is_dom,
                      is_betankande,
                      is_domskal,
                      is_domslut,
                      is_skiljaktig,
                      is_tillagg,
                      is_heading,
                      is_paragraph)
    # (state, recognizer) -> (constructor, new state); (False, None)
    # means "pop the current state and let the parent state retry"
    p.set_transitions({
        ("body", is_delmal): (make_delmal, "delmal"),
        ("body", is_instans): (make_instans, "instans"),
        ("body", is_endmeta): (make_endmeta, "endmeta"),
        ("delmal", is_instans): (make_instans, "instans"),
        ("delmal", is_delmal): (False, None),
        ("delmal", is_endmeta): (False, None),
        ("instans", is_betankande): (make_betankande, "betankande"),
        ("instans", is_domslut): (make_domslut, "domslut"),
        ("instans", is_dom): (make_dom, "dom"),
        ("instans", is_instans): (False, None),
        ("instans", is_skiljaktig): (make_skiljaktig, "skiljaktig"),
        ("instans", is_tillagg): (make_tillagg, "tillagg"),
        ("instans", is_delmal): (False, None),
        ("instans", is_endmeta): (False, None),
        # either (make_domskal, "domskal") or (False, None)
        ("betankande", is_domskal): transition_domskal,
        ("betankande", is_domslut): (make_domslut, "domslut"),
        ("betankande", is_dom): (False, None),
        ("__done__", is_domskal): (False, None),
        ("__done__", is_skiljaktig): (False, None),
        ("__done__", is_tillagg): (False, None),
        ("__done__", is_delmal): (False, None),
        ("__done__", is_endmeta): (False, None),
        ("__done__", is_domslut): (make_domslut, "domslut"),
        ("dom", is_domskal): (make_domskal, "domskal"),
        ("dom", is_domslut): (make_domslut, "domslut"),
        ("dom", is_instans): (False, None),
        ("dom", is_skiljaktig): (False, None),  # Skiljaktig mening is not considered
                                                # part of the dom, but rather an appendix
        ("dom", is_tillagg): (False, None),
        ("dom", is_endmeta): (False, None),
        ("dom", is_delmal): (False, None),
        ("domskal", is_delmal): (False, None),
        ("domskal", is_domslut): (False, None),
        ("domskal", is_instans): (False, None),
        ("domslut", is_delmal): (False, None),
        ("domslut", is_instans): (False, None),
        ("domslut", is_domskal): (False, None),
        ("domslut", is_skiljaktig): (False, None),
        ("domslut", is_tillagg): (False, None),
        ("domslut", is_endmeta): (False, None),
        ("domslut", is_dom): (False, None),
        ("skiljaktig", is_domslut): (False, None),
        ("skiljaktig", is_instans): (False, None),
        ("skiljaktig", is_skiljaktig): (False, None),
        ("skiljaktig", is_tillagg): (False, None),
        ("skiljaktig", is_delmal): (False, None),
        ("skiljaktig", is_endmeta): (False, None),
        ("tillagg", is_tillagg): (False, None),
        ("tillagg", is_delmal): (False, None),
        ("tillagg", is_endmeta): (False, None),
        ("endmeta", is_paragraph): (make_paragraph, None),
        (commonstates, is_heading): (make_heading, None),
        (commonstates, is_paragraph): (make_paragraph, None),
    })
    # only NJA and MD cases (distinguished by the first three
    # chars of basefile) can have ordered paragraphs
    p.has_ordered_paras = basefile[:3] in ('HDO', 'MDO')
# parser configuration that is identical between the 'default'
# and 'simple' parser
p.initial_state = "body"
p.initial_constructor = make_body
# set FERENDA_FSMDEBUG in the environment to trace parser decisions
p.debug = os.environ.get('FERENDA_FSMDEBUG', False)
# In some cases it's difficult to determine court from document alone.
p.defaultcourt = {'PMD': 'Patent- och marknadsöverdomstolen',
                  'MMD': 'Mark- och miljööverdomstolen'}.get(basefile.split("/")[0])
# return p
return p.parse
# FIXME: Get this information from self.commondata and the slugs
# file. However, that data does not contain lower-level
# courts. For now, we use the slugs used internally at
# Domstolsverket, but lower-case. Also, this list does not attempt
# to bridge when a court changes name (eg. LST and FST are
# distinct, even though they refer to the "same" court). In the
# case of adminstrative decisions, this also includes slugs for
# commmon administrative agencies.
#
# FIX: two literally duplicated entries removed ("Hovrätten för
# Västra Sverige" and "Hovrätten över Skåne och Blekinge" each
# appeared twice with identical values; in a dict literal the later
# key silently wins, so behavior is unchanged).
#
# NOTE(review): some keys look misspelled ("Norbottens",
# "Skattverket", "Stockholm tingsrätt") -- presumably kept on purpose
# to match misspellings occurring in the source material; confirm
# before "fixing" them.
courtslugs = {
    "Skatterättsnämnden": "SRN",
    "Skatteverket": "SKV",
    "Migrationsverket": "MIV",
    "PTS": "PTS",
    "Attunda tingsrätt": "TAT",
    "Blekinge tingsrätt": "TBL",
    "Bollnäs TR": "TBOL",
    "Borås tingsrätt": "TBOR",
    "Eskilstuna tingsrätt": "TES",
    "Eslövs TR": "TESL",
    "Eksjö TR": "TEK",
    "Falu tingsrätt": "TFA",
    "Försäkringskassan": "FSK",
    "Förvaltningsrätten i Göteborg": "FGO",
    "Förvaltningsrätten i Göteborg, migrationsdomstolen": "MGO",
    "Förvaltningsrätten i Malmö": "FMA",
    "Förvaltningsrätten i Malmö, migrationsdomstolen": "MFM",
    "Förvaltningsrätten i Stockholm": "FST",
    "Förvaltningsrätten i Stockholm, migrationsdomstolen": "MFS",
    "Gotlands tingsrätt": "TGO",
    "Gävle tingsrätt": "TGA",
    "Göta hovrätt": "HGO",
    "Göteborgs TR": "TGO",
    "Göteborgs tingsrätt": "TGO",
    "Halmstads tingsrätt": "THA",
    "Helsingborgs tingsrätt": "THE",
    "Hudiksvalls tingsrätt": "THU",
    "Jönköpings tingsrätt": "TJO",
    "Kalmar tingsrätt": "TKA",
    "Kammarrätten i Sundsvall": "KSU",
    "Kristianstads tingsrätt": "TKR",
    "Linköpings tingsrätt": "TLI",
    "Ljusdals TR": "TLJ",
    "Luleå tingsrätt": "TLU",
    "Lunds tingsrätt": "TLU",
    "Lycksele tingsrätt": "TLY",
    "Länsrätten i Dalarnas län": "LDA",
    "Länsrätten i Göteborg": "LGO",
    "Länsrätten i Jämtlands län": "LJA",
    "Länsrätten i Kopparbergs län": "LKO",
    "Länsrätten i Kronobergs län": "LKR",
    "Länsrätten i Malmöhus län": "LMAL",
    "Länsrätten i Mariestad": "LMAR",
    "Länsrätten i Norbottens län": "LNO",
    "Länsrätten i Skaraborgs län": "LSK",
    "Länsrätten i Skåne län": "LSK",
    "Länsrätten i Stockholms län": "LST",
    "Länsrätten i Stockholms län, migrationsdomstolen": "MLS",
    "Länsrätten i Södermanlands län": "LSO",
    "Länsrätten i Uppsala län": "LUP",
    "Länsrätten i Vänersborg": "LVAN",
    "Länsrätten i Värmlands län": "LVAR",
    "Länsrätten i Västerbottens län": "LVAB",
    "Länsrätten i Västmanlands län": "LVAL",
    "Länsrätten i Älvsborgs län": "LAL",
    "Malmö TR": "TMA",
    "Malmö tingsrätt": "TMA",
    "Mariestads tingsrätt": "TMAR",
    "Mora tingsrätt": "TMO",
    "Nacka tingsrätt": "TNA",
    "Norrköpings tingsrätt": "TNO",
    "Nyköpings tingsrätt": "TNY",
    "Skaraborgs tingsrätt": "TSK",
    "Skövde TR": "TSK",
    "Solna tingsrätt": "TSO",
    "Stockholms TR": "TST",
    "Stockholms tingsrätt": "TST",
    "Sundsvalls tingsrätt": "TSU",
    "Svea hovrätt, Mark- och miljööverdomstolen": "MHS",
    "Södertälje tingsrätt": "TSE",
    "Södertörns tingsrätt": "TSN",
    "Södra Roslags TR": "TSR",
    "Uddevalla tingsrätt": "TUD",
    "Umeå tingsrätt": "TUM",
    "Uppsala tingsrätt": "TUP",
    "Varbergs tingsrätt": "TVAR",
    "Vänersborgs tingsrätt": "TVAN",
    "Värmlands tingsrätt": "TVARM",
    "Västmanlands tingsrätt": "TVAS",
    "Växjö tingsrätt": "TVA",
    "Ångermanlands tingsrätt": "TAN",
    "Örebro tingsrätt": "TOR",
    "Östersunds tingsrätt": "TOS",
    "Kammarrätten i Jönköping": "KJO",
    "Kammarrätten i Göteborg": "KGO",
    "Kammarrätten i Stockholm": "KST",
    "Göta HovR": "HGO",
    "HovR:n för Nedre Norrland": "HNN",
    "HovR:n för Västra Sverige": "HVS",
    "HovR:n för Övre Norrland": "HON",
    "HovR:n över Skåne och Blekinge": "HSB",
    "Hovrätten för Nedre Norrland": "HNN",
    "Hovrätten för Västra Sverige": "HVS",
    "Hovrätten för Övre Norrland": "HON",
    "Hovrätten över Skåne och Blekinge": "HSB",
    "Svea HovR": "HSV",
    "Svea hovrätt": "HSV",
    # supreme courts generally use abbrevs established by
    # Vägledande rättsfall.
    "Kammarrätten i Stockholm, Migrationsöverdomstolen": "MIG",
    "Migrationsöverdomstolen": "MIG",
    "Högsta förvaltningsdomstolen": "HFD",
    "Regeringsrätten": "REGR",  # REG is "Regeringen"
    "Högsta domstolen": "HDO",
    "HD": "HDO",
    "arbetsdomstolen": "ADO",
    "Mark- och miljööverdomstolen": "MMD",
    "Patentbesvärsrätten": "PBR",
    "Patent- och marknadsöverdomstolen": "PMÖD",
    # for when the type of court, but not the specific court, is given
    "HovR:n": "HovR",
    "Hovrätten": "HovR",
    "Kammarrätten": "KamR",
    "Länsrätten": "LR",
    "TR:n": "TR",
    "Tingsrätten": "TR",
    "länsrätten": "LR",
    "miljödomstolen": "MID",
    "tingsrätten": "TR",
    "Länsstyrelsen": "LST",
    "Marknadsdomstolen": "MD",
    "Migrationsdomstolen": "MID",
    "fastighetsdomstolen": "FD",
    "förvaltningsrätten": "FR",
    "hovrätten": "HovR",
    "kammarrätten": "KamR",
    # additional courts/agencies
    "Alingsås TR": "TAL",
    "Alingsås tingsrätt": "TAL",
    "Arbetslöshetskassan": "ALK",
    "Arvika TR": "TAR",
    "Banverket": "BAN",
    "Bodens TR": "TBO",
    "Bollnäs tingsrätt": "TBO",
    "Borås TR": "TBO",
    "Byggnadsnämnden": "BYN",
    "Datainspektionen": "DI",
    "Eksjö tingsrätt": "TEK",
    "Energimyndigheten": "ENM",
    "Enköpings TR": "TEN",
    "Enköpings tingsrätt": "TEN",
    "Eskilstuna TR": "TES",
    "Falköpings TR": "TFA",
    "Falköpings tingsrätt": "TFA",
    "Falu TR": "TFA",
    "Fastighetsmäklarnämnden": "FMN",
    "Fastighetstaxeringsnämnden": "FTN",
    "Finansinspektionen": "FI",
    "Forskarskattenämnden": "FSN",
    "Förvaltningsrätten i Falun": "FFA",
    "Förvaltningsrätten i Härnösand": "FHA",
    "Förvaltningsrätten i Jönköping": "FJO",
    "Förvaltningsrätten i Karlstad": "FKA",
    "Förvaltningsrätten i Linköping": "FLI",
    "Förvaltningsrätten i Luleå": "FLU",
    "Förvaltningsrätten i Luleå, migrationsdomstolen": "FLUM",
    "Förvaltningsrätten i Skåne län": "FSK",
    "Förvaltningsrätten i Umeå": "FUM",
    "Förvaltningsrätten i Uppsala": "FUP",
    "Förvaltningsrätten i Växjö": "FVA",
    "Gotlands TR": "TGO",
    "Gällivare TR": "TGÄ",
    "Gällivare tingsrätt": "TGÄ",
    "Gävle TR": "TGÄ",
    "Hallsbergs TR": "THA",
    "Halmstads TR": "THA",
    "Handens TR": "THA",
    "Handens tingsrätt": "THA",
    "Haparanda TR": "THA",
    "Haparanda tingsrätt": "THA",
    "Hedemora TR": "THE",
    "Helsingborgs TR": "THE",
    "Huddinge TR": "THU",
    "Huddinge tingsrätt": "THU",
    "Hudiksvalls TR": "THU",
    "Härnösands TR": "THÄ",
    "Härnösands tingsrätt": "THÄ",
    "Hässleholms TR": "THÄ",
    "Hässleholms tingsrätt": "THÄ",
    "Invandrarverket": "INV",
    "Jakobsbergs TR": "TJA",
    "Jordbruksverket": "JBV",
    "Jämtbygdens TR": "TJÄ",
    "Jönköpings TR": "TJÖ",
    "Kalmar TR": "TKA",
    "Kammarkollegiet": "KK",
    "Karlshamns TR": "TKA",
    "Karlskoga TR": "TKA",
    "Karlskoga tingsrätt": "TKA",
    "Karlskrona TR": "TKA",
    "Karlskrona tingsrätt": "TKA",
    "Karlstads TR": "TKA",
    "Karlstads tingsrätt": "TKA",
    "Katrineholms TR": "TKA",
    "Katrineholms tingsrätt": "TKA",
    "Klippans TR": "TKL",
    "Klippans tingsrätt": "TKL",
    "Koncessionsnämnden för miljöskydd": "KFM",
    "Kriminalvården": "KRV",
    "Kristianstads TR": "TKR",
    "Kristinehamns TR": "TKR",
    "Kristinehamns tingsrätt": "TKR",
    "Kyrkogårdsnämnden": "KGN",
    "Kyrkogårdsstyrelsen": "KGS",
    "Köpings TR": "TKÖ",
    "Landskrona TR": "TLA",
    "Landskrona tingsrätt": "TLA",
    "Leksands TR": "TLE",
    "Lidköpings TR": "TLI",
    "Lidköpings tingsrätt": "TLI",
    "Lindesbergs TR": "TLI",
    "Linköpings TR": "TLI",
    "Ljungby TR": "TLJ",
    "Ljungby tingsrätt": "TLJ",
    "Ludvika TR": "TLU",
    "Ludvika tingsrätt": "TLU",
    "Luleå TR": "TLU",
    "Lunds TR": "TLU",
    "Lycksele TR": "TLY",
    "Läkemedelsverket": "LMV",
    "Länsrätten i Blekinge län": "LBL",
    "Länsrätten i Gotlands län": "LGO",
    "Länsrätten i Gävleborgs län": "LGÄ",
    "Länsrätten i Göteborg, migrationsdomstolen": "LGÖ",
    "Länsrätten i Hallands län": "LHA",
    "Länsrätten i Jönköpings län": "LJÖ",
    "Länsrätten i Kalmar län": "LKA",
    "Länsrätten i Kristianstads län": "LKR",
    "Länsrätten i Norrbottens län": "LNO",
    "Länsrätten i Skåne": "LSK",
    "Länsrätten i Skåne län, migrationsdomstolen": "LSK",
    "Länsrätten i Stockholm": "LST",
    "Länsrätten i Stockholm län": "LST",
    "Länsrätten i Stockholm, migrationsdomstolen": "LSTM",
    "Länsrätten i Västernorrlands län": "LVÄ",
    "Länsrätten i Örebro län": "LÖR",
    "Länsrätten i Östergötlands län": "LÖS",
    "Länsstyrelsen i Dalarnas län": "LSTD",
    "Länsstyrelsen i Stockholms län": "LSTS",
    "Mariestads TR": "TMA",
    "Mjölby TR": "TMJ",
    "Mora TR": "TMO",
    "Motala TR": "TMO",
    "Mölndals TR": "TMÖ",
    "Mölndals tingsrätt": "TMÖ",
    "Nacka TR": "TNA",
    "Nacka tingsrätt, mark- och miljödomstolen": "TNAM",
    "Nacka tingsrätt, miljödomstolen": "TNAM",
    "Norrköpings TR": "TNO",
    "Norrtälje tingsrätt": "TNO",
    "Nyköpings TR": "TNY",
    "Omsorgsnämnden i Trollhättans kommun": "OMS",
    "Oskarshamns TR": "TOS",
    "Oskarshamns tingsrätt": "TOS",
    "Piteå TR": "TPI",
    "Polismyndigheten": "POL",
    "RTV": "RTV",
    "Regeringen": "REG",
    "Revisorsnämnden": "REV",
    "Ronneby TR": "TRO",
    "Ronneby tingsrätt": "TRO",
    "Rättsskyddscentralen": "RSC",
    "Sala TR": "TSA",
    "Sandvikens TR": "TSA",
    "Simrishamns TR": "TSI",
    "Sjuhäradsbygdens TR": "TSJ",
    "Sjuhäradsbygdens tingsrätt": "TSJ",
    "Skattemyndigheten": "SKM",
    "Skattemyndigheten i Luleå": "SKML",
    "Skattverket": "SKV",
    "Skellefteå TR": "TSK",
    "Skellefteå tingsrätt": "TSK",
    "Skövde tingsrätt": "TSK",
    "Socialnämnden": "SON",
    "Socialstyrelsen": "SOS",
    "Sollefteå TR": "TSO",
    "Sollentuna TR": "TSO",
    "Sollentuna tingsrätt": "TSO",
    "Solna TR": "TSO",
    "Statens jordbruksverk": "SJV",
    "Stenungsunds TR": "TST",
    "Stenungsunds tingsrätt": "TST",
    "Stockholm tingsrätt": "TST",
    "Strömstads tingsrätt": "TST",
    "Sundsvalls TR": "TSU",
    "Sunne TR": "TSU",
    "Sunne tingsrätt": "TSU",
    "Svegs TR": "TSV",
    "Södertälje TR": "TSÖ",
    "Södra Roslags tingsrätt": "TSÖ",
    "Sölvesborgs TR": "TSÖ",
    "Tierps TR": "TTI",
    "Tierps tingsrätt": "TTI",
    "Trafiknämnden": "TRN",
    "Transportstyrelsen": "TS",
    "Trelleborgs TR": "TTR",
    "Trelleborgs tingsrätt": "TTR",
    "Trollhättans TR": "TTR",
    "Tullverket": "TV",
    "Uddevalla TR": "TUD",
    "Umeå TR": "TUM",
    "Umeå tingsrätt, mark- och miljödomstolen": "TUMM",
    "Ungdomsstyrelsen": "US",
    "Uppsala TR": "TUP",
    "Varbergs TR": "TVA",
    "Vattenöverdomstolen": "VÖD",
    "Vänersborgs TR": "TVÄ",
    "Vänersborgs tingsrätt, Miljödomstolen": "TVÄM",
    "Vänersborgs tingsrätt, mark- och miljödomstolen": "TVÄM",
    "Värnamo TR": "TVÄ",
    "Värnamo tingsrätt": "TVÄ",
    "Västerviks TR": "TVÄ",
    "Västerås TR": "TVÄ",
    "Västerås tingsrätt": "TVÄ",
    "Växjö TR": "TVÄ",
    "Växjö tingsrätt, mark- och miljödomstolen": "TVÄM",
    "Växjö tingsrätt, miljödomstolen": "TVÄM",
    "Ystads TR": "TYS",
    "Ystads tingsrätt": "TYS",
    "hovrätten för Västra Sverige": "HVS",
    "kammarrätten i Göteborg": "KGO",
    "länsrätten i Skåne län, migrationsdomstolen": "LSKM",
    "länsstyrelsen": "LST",
    "migrationsdomstolen": "MD",
    "regeringen": "REG",
    "skattemyndigheten": "SKM",
    "Ängelholms TR": "TÄN",
    "Åmåls TR": "TÅM",
    "Örebro TR": "TÖR",
    "Örnsköldsviks tingsrätt": "TÖR",
    "Östersunds TR": "TÖS"
}
def construct_id(self, node, state):
    """Tree-visitor that assigns URIs to Delmal, Instans and
    OrderedParagraph nodes, deriving them from the document URI carried
    in state['uri'].

    Returns the state dict (possibly copied and updated) to use when
    recursing into the node's children, or None to stop recursion for
    node types that never contain URI-carrying descendants.
    """
    if isinstance(node, Delmal):
        state = dict(state)
        node.uri = state['uri'] + "#" + node.ordinal
        # falls through to the common state update at the bottom
    elif isinstance(node, Instans):
        if node.court:
            state = dict(state)
            courtslug = self.courtslugs.get(node.court, "XXX")
            if courtslug == "XXX":
                self.log.warning("%s No slug defined for court %s" % (state["basefile"], node.court))
            # the first fragment on the URI uses "#", nested ones "/"
            if "#" not in state['uri']:
                state['uri'] += "#"
            else:
                state['uri'] += "/"
            node.uri = state['uri'] + courtslug
            # falls through to the common state update at the bottom
        else:
            # anonymous instans: assign no URI, but keep recursing with
            # unchanged state
            return state
    elif isinstance(node, OrderedParagraph):
        separator = "/" if "#" in state['uri'] else "#"
        node.uri = state['uri'] + separator + "P" + node.ordinal
        # NOTE: state['uri'] is deliberately left unchanged here --
        # paragraphs contain no further URI-carrying nodes
        return state
    elif isinstance(node, (Body, Dom, Domskal)):
        # transparent containers: recurse with unchanged state
        return state
    else:
        # any other node type: stop recursing below it
        return None
    state['uri'] = node.uri
    return state
def visitor_functions(self, basefile):
    """Return the (visitor function, initial state) tuples to run over
    the parsed tree: just construct_id, seeded with the canonical
    document URI and the basefile."""
    initial_state = {'uri': self._canonical_uri,
                     'basefile': basefile}
    return ((self.construct_id, initial_state),)
def facets(self):
    """Return the Facet objects controlling indexing, TOC and feed
    generation for this repository.

    NOTE: it's important that RPUBL.rattsfallspublikation is the
    first facet (toc_pagesets depend on it).
    """
    def myselector(row, binding, resource_graph=None):
        # group documents by (publication slug, year), eg ("nja", "1981")
        return (util.uri_leaf(row['rpubl_rattsfallspublikation']),
                row['rpubl_arsutgava'])
    # FIXME: This isn't used anymore -- when was it used and by which facet?
    def mykey(row, binding, resource_graph=None):
        if binding == "main":
            # we'd really like
            # rpubl:VagledandeDomstolsavgorande/rpubl:avgorandedatum,
            # but that requires modifying facet_query
            return row['update']
        else:
            return util.split_numalpha(row['dcterms_identifier'])
    return [Facet(RPUBL.rattsfallspublikation,
                  indexingtype=fulltextindex.Resource(),
                  use_for_toc=True,
                  use_for_feed=True,
                  selector=myselector,  # => ("ad","2001"), ("nja","1981")
                  key=Facet.resourcelabel,
                  identificator=Facet.defaultselector,
                  dimension_type='ref'),
            Facet(RPUBL.referatrubrik,
                  indexingtype=fulltextindex.Text(boost=4),
                  toplevel_only=True,
                  use_for_toc=False),
            Facet(DCTERMS.identifier,
                  use_for_toc=False),
            Facet(RPUBL.arsutgava,
                  indexingtype=fulltextindex.Label(),
                  use_for_toc=False,
                  selector=Facet.defaultselector,
                  key=Facet.defaultselector,
                  dimension_type='value'),
            Facet(RDF.type,
                  use_for_toc=False,
                  use_for_feed=True,
                  # dimension_label="main", # FIXME:
                  # dimension_label must be calculated as rdf_type
                  # or else the data from faceted_data() won't be
                  # usable by wsgi.stats
                  # key= # FIXME add useful key method for sorting docs
                  identificator=lambda x, y, z: None),
            Facet(RPUBL.avgorandedatum,  # we need this data when
                                         # creating feeds, but not
                                         # to sort/group by
                  use_for_toc=False,
                  use_for_feed=False)
            ] + self.standardfacets
def facet_query(self, context):
    """Build the SPARQL query used for faceted data, adjusted for the
    fact that rpubl:avgorandedatum lives on a linked resource.

    FIXME: This is really hacky, but the rpubl:avgorandedatum that we
    need is not a property of the root resource but of a linked
    resource, so we postprocess the base class' query to follow the
    rpubl:referatAvDomstolsavgorande link.
    """
    query = super(DV, self).facet_query(context)
    direct = "?uri rpubl:avgorandedatum ?rpubl_avgorandedatum"
    linked = ("?uri rpubl:referatAvDomstolsavgorande ?domuri . "
              "?domuri rpubl:avgorandedatum ?rpubl_avgorandedatum")
    return query.replace(direct, linked)
def _relate_fulltext_resources(self, body):
res = []
uris = set()
for r in body.findall(".//*[@about]"):
if r.get("class") == "bodymeta":
continue
if r.get("about") not in uris:
uris.add(r.get("about"))
res.append(r)
return [body] + res
# Class-level cache (ie shared across instances) mapping a root document
# URI to its {"creator", "issued", "label"} values; filled lazily by
# _relate_fulltext_value.
_relate_fulltext_value_cache = {}
def _relate_fulltext_value(self, facet, resource, desc):
    """Compute (dimension_label, value) for a fulltext index field of a
    (sub)resource.

    For the "label", "creator" and "issued" dimensions the root
    resource's values are computed once per document and stored in the
    class-level _relate_fulltext_value_cache (keyed on root URI); all
    other dimensions are delegated to the superclass.

    NOTE(review): the cache lookup assumes that the root resource
    (URI without "#") is processed before any of its subresources --
    confirm that _relate_fulltext_resources guarantees this ordering.
    """
    def rootlabel(desc):
        # the root resource's label is its dcterms:identifier
        return desc.getvalue(DCTERMS.identifier)
    if facet.dimension_label in ("label", "creator", "issued"):
        # "creator" and "issued" should be identical for the root
        # resource and all contained subresources. "label" can
        # change slighly.
        resourceuri = resource.get("about")
        rooturi = resourceuri.split("#")[0]
        if "#" not in resourceuri:
            # root resource: fill the cache, temporarily re-pointing
            # desc at the linked rpubl:referatAvDomstolsavgorande
            # resource (where creator/issued live) and back again
            l = rootlabel(desc)
            desc.about(desc.getrel(RPUBL.referatAvDomstolsavgorande))
            self._relate_fulltext_value_cache[rooturi] = {
                "creator": desc.getrel(DCTERMS.publisher),
                "issued": desc.getvalue(RPUBL.avgorandedatum),
                "label": l
            }
            desc.about(resourceuri)
        v = self._relate_fulltext_value_cache[rooturi][facet.dimension_label]
        if facet.dimension_label == "label" and "#" in resourceuri:
            # subresource labels: prefer dcterms:creator, falling back
            # to the URI fragment (the court slug)
            if desc.getvalues(DCTERMS.creator):
                court = desc.getvalue(DCTERMS.creator)
            else:
                court = resource.get("about").split("#")[1]
            # v = "%s (%s)" % (v, court)
            v = court
        return facet.dimension_label, v
    else:
        return super(DV, self)._relate_fulltext_value(facet, resource, desc)
def tabs(self):
    """Return the site navigation tabs contributed by this repository."""
    label = "Vägledande rättsfall"
    return [(label, self.dataset_uri())]