Parsing Huge XML Files Incrementally
This post presents a Python script for parsing huge XML files incrementally. The purpose of the script is to convert XML tables to delimited text files. I searched online for inspiration while making the script and found relevant documentation and very useful posts with code examples. However, it took me a couple of days to develop the complete solution for this trivial task and this is why I have chosen to publish my script here. Alternatively, you can access the code on GitHub.
Use Case
The XML tables are generally huge, in the order of tens of gigabytes or more. Here is a very small example showing the XML content of a table with twelve columns and three rows:
<?xml version="1.0" encoding="UTF-8"?> <table xmlns="http://www.sa.dk/xmlns/siard/1.0/schema0/table.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.sa.dk/xmlns/siard/1.0/schema0/table.xsd table.xsd"> <row> <c1>1</c1> <c2>2</c2> <c3>1991-09-23T12:00:00.510000</c3> <c4 xsi:nil="true"/> <c5 xsi:nil="true"/> <c6 xsi:nil="true"/> <c7 xsi:nil="true"/> <c8>3</c8> <c9>A</c9> <c10>B</c10> <c11>C</c11> <c12>D</c12> </row> <row> <c1>4</c1> <c2>5</c2> <c3>1992-10-24T13:00:00.520000</c3> <c4 xsi:nil="true"/> <c5 xsi:nil="true"/> <c6 xsi:nil="true"/> <c7 xsi:nil="true"/> <c8>6</c8> <c9>E</c9> <c10>F</c10> <c11>G</c11> <c12>H</c12> </row> <row> <c1>7</c1> <c2>8</c2> <c3>1993-11-25T14:00:00.530000</c3> <c4 xsi:nil="true"/> <c5 xsi:nil="true"/> <c6 xsi:nil="true"/> <c7 xsi:nil="true"/> <c8>9</c8> <c9>I</c9> <c10>J</c10> <c11>K</c11> <c12>L</c12> </row> </table>
Parsing Huge XML Files Incrementally
Here is the code that performs incremental parsing of the XML table while converting the content to a delimited text file:
import argparse
import csv
import errno
import os
from typing import Dict, Callable, Any, Optional, List

from lxml import etree


class XMLParser(object):
    """
    Incremental parsing of an XML file.

    Each element in the tag context is processed via a callable. A namespace
    map is automatically added to `callable_kwargs` (under the key
    ``namespaces``) if the document declares any namespaces.

    Note that parsing starts immediately: the whole file is processed inside
    the constructor.

    :param xml_file: XML file.
    :param python_callable: A function called for each element in the tag.
    :param callable_args: A list of positional arguments that will get
        unpacked in the callable.
    :param callable_kwargs: A dictionary of keyword arguments that will get
        unpacked in the callable.
    :param tag: Restrict elements to those elements that match the given tag,
        defaults to all elements. Namespaces must be declared in Clark's
        Notation: {URI}localname.
    :param dtd_validation: Validate the document against a DTD, defaults to
        False.
    :param schema: Validate the document against an XML schema (bytes
        version).
    :raises TypeError: If `python_callable` is not callable.
    :raises RuntimeError: If `xml_file` is empty or does not exist.
    """

    def __init__(self,
                 xml_file: str,
                 python_callable: Callable[[etree.Element, Any], None],
                 callable_args: Optional[List] = None,
                 callable_kwargs: Optional[Dict] = None,
                 tag: Optional[str] = None,
                 dtd_validation: bool = False,
                 schema: Optional[bytes] = None) -> None:
        if not callable(python_callable):
            raise TypeError('The `python_callable` parameter must be callable.')

        self.xml_file = xml_file
        self.python_callable = python_callable
        # Work on copies: `fast_iteration` injects a 'namespaces' entry into
        # the keyword arguments and must not modify the caller's objects.
        self.callable_args = list(callable_args) if callable_args else []
        self.callable_kwargs = dict(callable_kwargs) if callable_kwargs else {}
        self.tag = tag
        self.dtd_validation = dtd_validation
        self.schema = etree.XMLSchema(etree.XML(schema)) if schema else None

        if not self.is_non_empty_file(self.xml_file):
            raise RuntimeError(f'{self.xml_file} is empty or non-existing.')

        xml_tree = etree.iterparse(
            self.xml_file,
            tag=self.tag,
            dtd_validation=self.dtd_validation,
            events=('start-ns', 'end'),  # namespaces, element
            remove_blank_text=True,
            encoding='utf-8',
            schema=self.schema
        )
        self.fast_iteration(xml_tree)  # Iterate through parsed tag

    def fast_iteration(self, xml_tree: etree.iterparse) -> None:
        """
        A method to loop through a XML context, calling `python_callable` each
        time, and then clean up unneeded references.

        :param xml_tree: Return value from the iterparse API,
            tuple(event, element).
        """
        namespaces = {}
        for event, element in xml_tree:
            if event == 'start-ns':
                # For 'start-ns' element is a tuple (prefix, URI)
                prefix, url = element
                if not prefix:
                    # XPath cannot address an empty prefix, so the default
                    # namespace is registered under the name 'ns'.
                    prefix = 'ns'
                namespaces[prefix] = url  # Store namespace (prefix: URI)
            elif event == 'end':
                # Process element
                if namespaces:
                    self.callable_kwargs['namespaces'] = namespaces
                self.python_callable(element, *self.callable_args, **self.callable_kwargs)
                element.clear()
                # Eliminate empty references from the root node to element,
                # otherwise already processed siblings stay in the tree.
                for ancestor in element.xpath('ancestor-or-self::*'):
                    while ancestor.getprevious() is not None:
                        del ancestor.getparent()[0]
        del xml_tree

    @staticmethod
    def is_non_empty_file(file: Optional[str]) -> bool:
        """
        Return True if file exists and is not empty.

        A missing path (None or an empty string) yields False instead of
        raising, so optional command line arguments can be passed directly.
        """
        if not file:
            return False
        return os.path.isfile(file) and os.path.getsize(file) > 0

    @staticmethod
    def delete_file(file: str) -> None:
        """
        Delete file (which may not exist).

        Note: errno.ENOENT <=> no such file or directory. A missing file is
        ignored silently; any other OS error is printed.
        """
        try:
            os.remove(file)
            print(f'File deleted: {file}.')
        except OSError as os_error:
            if os_error.errno != errno.ENOENT:
                print(f'{str(os_error)}.')


def convert_to_csv(element: etree.Element, **kwargs) -> None:
    """
    Write/append row to CSV file.

    Every child of `element` becomes one field of the row; the field value is
    the text of the child.

    :param element: The element whose children are written as one row.
    :param kwargs: Must contain `csv_file` (path to the output file); may
        contain `namespaces` (prefix: URI map supplied by `XMLParser`).
    """
    csv_file = kwargs.get('csv_file')
    namespaces = kwargs.get('namespaces')

    # The 'ns' prefix only exists when the document declares a default
    # namespace; fall back to an unqualified lookup to avoid an XPath error.
    if namespaces and 'ns' in namespaces:
        first_column = element.xpath('ns:c1/text()', namespaces=namespaces)
    else:
        first_column = element.xpath('c1/text()')
    print(f'c1: {first_column}')

    # newline='' lets the csv module control line endings (no blank rows
    # between records on Windows).
    with open(csv_file, mode='a', encoding='utf-8', newline='') as file:
        writer = csv.writer(file, delimiter=';', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        writer.writerow([column.text for column in element])


def main() -> None:
    """
    Command line entry point: convert an XML table to a delimited text file.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument(
        '-t', '--tag', help="XML context", type=str)
    arg_parser.add_argument(
        '-i', '--input', help='Path to XML file', type=str, required=True)
    arg_parser.add_argument(
        '-o', '--output', help='Path to CSV file', type=str, required=True)
    arg_parser.add_argument(
        '-s', '--schema', help='Path to XSD file', type=str)
    args = arg_parser.parse_args()

    schema_xml = None
    if XMLParser.is_non_empty_file(args.schema):
        with open(args.schema, mode='rb') as schema_file:
            schema_xml = schema_file.read()
    elif args.schema:
        print(f'Schema ignored, {args.schema} is empty or non-existing.')

    # Rows are appended, so a previous output file must be removed first.
    XMLParser.delete_file(args.output)

    print(f'Processing: {args.input}.')
    XMLParser(
        xml_file=args.input,
        tag=args.tag,
        python_callable=convert_to_csv,
        callable_kwargs={'csv_file': args.output},
        schema=schema_xml
    )
    print('Done!')


if __name__ == '__main__':
    main()
Execute the script from shell:
python3 table_to_csv.py \
    --tag="{http://www.sa.dk/xmlns/siard/1.0/schema0/table.xsd}row" \
    --input="table.xml" \
    --output="table.csv"
References
The script is based on input from the posts below:
Hi Arménio
I am sorry for the long delay! I assume you are running the script? If so it simply says that you are missing the input file as well as the output path that must be given as arguments after the script:
python3 table_to_csv.py \
--tag="{http://www.sa.dk/xmlns/siard/1.0/schema0/table.xsd}row" \
--input="table.xml" \
--output="table.csv"
Hello, I tried to compile this code, but some error messages appear, like this one: error: the following arguments are required: -i/--input, -o/--output. Please, what can I do?