Skip to content
Snippets Groups Projects
Commit a39f67ea authored by csteindl's avatar csteindl
Browse files

Add Jupyter notebook for MARCXML to MARC21 processing via OAI-PMH

parent feac28db
Branches
No related tags found
No related merge requests found
%% Cell type:code id: tags:
``` python
from sickle import Sickle
from lxml import etree
from typing import List
import pymarc
import os
ns = {
'marc': 'http://www.loc.gov/MARC21/slim',
'oai': 'http://www.openarchives.org/OAI/2.0/'
}
```
%% Cell type:code id: tags:
``` python
def save_records_as_marc(records: List[etree.Element], filename: str, tmp_filename: str = 'tmp.xml'):
    """Save a list of lxml representations of MARCXML records in MARC21 format to a file.

    The records are first written to a temporary XML file wrapped in a single
    ``<records>`` root element. That file is then streamed through
    ``pymarc.map_xml`` and every parsed record is written to ``filename`` in
    binary MARC21 format.

    Parameters
    ----------
    records : List[etree.Element]
        The list of lxml records to process. The elements are appended to a
        new root element, so they are re-parented (lxml moves an element when
        it is appended to another parent).
    filename : str
        The location where to store the generated MARC21 file. Often ends with *.mrc suffix.
        An existing file is overwritten.
    tmp_filename : str, optional
        The function writes a temp file for processing. tmp_filename is the location for this file.
        It will be removed at the end of the function, even if the conversion fails.
    """
    # temporarily create xml file to feed the pymarc parser
    root = etree.Element('records')
    for record in records:
        root.append(record)

    try:
        etree.ElementTree(root).write(tmp_filename)
        # convert elements to marc21; the context manager guarantees the
        # output file is closed even when parsing or writing raises
        with open(filename, 'wb') as marc_file:
            writer = pymarc.MARCWriter(marc_file)
            pymarc.map_xml(writer.write, tmp_filename)
    finally:
        # remove temporary file (it may not exist if writing it failed early)
        if os.path.exists(tmp_filename):
            os.remove(tmp_filename)
```
%% Cell type:code id: tags:
``` python
# OAI-PMH client for the Alma endpoint.
# Extra keyword arguments are forwarded by Sickle to `requests`; the timeout is
# (connect, read) in seconds so that a stalled connection raises an error
# instead of blocking the harvest forever.
sickle = Sickle(
    'https://eu02.alma.exlibrisgroup.com/view/oai/43ACC_ONB/request',
    encoding='utf-8',
    timeout=(10, 300),
)
```
%% Cell type:code id: tags:
``` python
# get all records from set FULLMARC in marcxml format
records = sickle.ListRecords(metadataPrefix='marc21', set='FULLMARC', ignore_deleted=True)
# number of MARC records collected so far (drives batch numbering)
counter = 0
# exit loop after processing certain amount of records
# set to 0 if all records should be processed
max_records = 0
# store results in batches
batch_size = 100
batch = []
for record in records:
    record_xml = etree.fromstring(record.raw).find('.//marc:record', namespaces=ns)
    if record_xml is None:
        # OAI record without a MARC payload: nothing to convert, and a None
        # entry would make save_records_as_marc fail on root.append
        continue
    counter += 1
    batch.append(record_xml)
    if counter % batch_size == 0:
        save_records_as_marc(batch, f"batch_{counter // batch_size}.mrc")
        batch = []
    if max_records > 0 and counter >= max_records:
        break
# save last (partial) batch; skip it when empty so no empty .mrc file is
# written when the record count is an exact multiple of batch_size
if batch:
    save_records_as_marc(batch, f"batch_{counter // batch_size + 1}.mrc")
```
%% Output
---------------------------------------------------------------------------
KeyboardInterrupt Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_11580/1222585073.py in <module>
11 batch = []
12
---> 13 for record in records:
14 counter += 1
15 record_xml = etree.fromstring(record.raw).find('.//marc:record', namespaces=ns)
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\site-packages\sickle\iterator.py in __next__(self)
50
51 def __next__(self):
---> 52 return self.next()
53
54 def __repr__(self):
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\site-packages\sickle\iterator.py in next(self)
149 return mapped
150 if self.resumption_token and self.resumption_token.token:
--> 151 self._next_response()
152 else:
153 raise StopIteration
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\site-packages\sickle\iterator.py in _next_response(self)
136
137 def _next_response(self):
--> 138 super(OAIItemIterator, self)._next_response()
139 self._items = self.oai_response.xml.iterfind(
140 './/' + self.sickle.oai_namespace + self.element)
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\site-packages\sickle\iterator.py in _next_response(self)
82 'verb': self.verb
83 }
---> 84 self.oai_response = self.sickle.harvest(**params)
85 error = self.oai_response.xml.find(
86 './/' + self.sickle.oai_namespace + 'error')
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\site-packages\sickle\app.py in harvest(self, **kwargs)
119 :rtype: :class:`sickle.OAIResponse`
120 """
--> 121 http_response = self._request(kwargs)
122 for _ in range(self.max_retries):
123 if self._is_error_code(http_response.status_code) \
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\site-packages\sickle\app.py in _request(self, kwargs)
135 def _request(self, kwargs):
136 if self.http_method == 'GET':
--> 137 return requests.get(self.endpoint, params=kwargs, **self.request_args)
138 return requests.post(self.endpoint, data=kwargs, **self.request_args)
139
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\site-packages\requests\api.py in get(url, params, **kwargs)
73 """
74
---> 75 return request('get', url, params=params, **kwargs)
76
77
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\site-packages\requests\api.py in request(method, url, **kwargs)
59 # cases, and look like a memory leak in others.
60 with sessions.Session() as session:
---> 61 return session.request(method=method, url=url, **kwargs)
62
63
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\site-packages\requests\sessions.py in request(self, method, url, params, data, headers, cookies, files, auth, timeout, allow_redirects, proxies, hooks, stream, verify, cert, json)
540 }
541 send_kwargs.update(settings)
--> 542 resp = self.send(prep, **send_kwargs)
543
544 return resp
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\site-packages\requests\sessions.py in send(self, request, **kwargs)
653
654 # Send the request
--> 655 r = adapter.send(request, **kwargs)
656
657 # Total elapsed time of the request (approximately)
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\site-packages\requests\adapters.py in send(self, request, stream, timeout, verify, cert, proxies)
437 try:
438 if not chunked:
--> 439 resp = conn.urlopen(
440 method=request.method,
441 url=url,
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\site-packages\urllib3\connectionpool.py in urlopen(self, method, url, body, headers, retries, redirect, assert_same_host, timeout, pool_timeout, release_conn, chunked, body_pos, **response_kw)
697
698 # Make the request on the httplib connection object.
--> 699 httplib_response = self._make_request(
700 conn,
701 method,
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
443 # Python 3 (including for exceptions like SystemExit).
444 # Otherwise it looks like a bug in the code.
--> 445 six.raise_from(e, None)
446 except (SocketTimeout, BaseSSLError, SocketError) as e:
447 self._raise_timeout(err=e, url=url, timeout_value=read_timeout)
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\site-packages\urllib3\packages\six.py in raise_from(value, from_value)
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\site-packages\urllib3\connectionpool.py in _make_request(self, conn, method, url, timeout, chunked, **httplib_request_kw)
438 # Python 3
439 try:
--> 440 httplib_response = conn.getresponse()
441 except BaseException as e:
442 # Remove the TypeError from the exception chain in
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\http\client.py in getresponse(self)
1347 try:
1348 try:
-> 1349 response.begin()
1350 except ConnectionError:
1351 self.close()
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\http\client.py in begin(self)
314 # read until we get a non-100 response
315 while True:
--> 316 version, status, reason = self._read_status()
317 if status != CONTINUE:
318 break
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\http\client.py in _read_status(self)
275
276 def _read_status(self):
--> 277 line = str(self.fp.readline(_MAXLINE + 1), "iso-8859-1")
278 if len(line) > _MAXLINE:
279 raise LineTooLong("status line")
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\socket.py in readinto(self, b)
702 while True:
703 try:
--> 704 return self._sock.recv_into(b)
705 except timeout:
706 self._timeout_occurred = True
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\ssl.py in recv_into(self, buffer, nbytes, flags)
1239 "non-zero flags not allowed in calls to recv_into() on %s" %
1240 self.__class__)
-> 1241 return self.read(nbytes, buffer)
1242 else:
1243 return super().recv_into(buffer, nbytes, flags)
c:\Users\onbcst\AppData\Local\Programs\Python\Python39\lib\ssl.py in read(self, len, buffer)
1097 try:
1098 if buffer is not None:
-> 1099 return self._sslobj.read(len, buffer)
1100 else:
1101 return self._sslobj.read(len)
KeyboardInterrupt:
%% Cell type:code id: tags:
``` python
```
......@@ -33,4 +33,8 @@ the number of entries in one specific set.
Small script to harvest ABO metadata in Marc format via OAI-PMH.
## [MARCXML to MARC21 transformation](MARCXML%20to%20MARC21.ipynb)
Small script to harvest MARCXML records via OAI-PMH and store them as raw MARC21. This step is useful if you want to process the data with other tools, e.g. [MarcEdit](https://marcedit.reeset.net/).
[![Binder](https://mybinder.org/badge_logo.svg)](https://mybinder.org/v2/git/https%3A%2F%2Flabs.onb.ac.at%2Fgitlab%2Flabs-team%2FLOD/master)
\ No newline at end of file
......@@ -6,3 +6,4 @@ xlsxwriter
SPARQLWrapper
sickle
mysql-connector-python
pymarc
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment