How to Recover Data Records with Vimscript

This post shows how to recover records from fixed-format data files that have no end-of-line characters to mark the end of each record. The record length must therefore be known in advance, preferably from a metadata file. To recover the data records, the text is split into multiple lines of the given record length.

As a data provider, I was asked to recover data records from a fixed-format data file without end-of-line characters. At the time, I chose Vimscript to accomplish this task. Vim can easily execute formatting tasks via scripting. However, string operations on very long lines are costly in Vimscript, particularly inside loops.

Thus, I decided to split the original data file into smaller files, split the lines in the smaller files into individual records, and finally append the individual records from the smaller files into a single output data file. I created two functions to do just that. These functions are called PostTypeFixed() and MakeLines().

PostTypeFixed() takes two arguments, the actual record length and an integer called factor. Factor is the number of smaller files created by the function. PostTypeFixed() calls MakeLines() on each of the smaller files. MakeLines() splits the lines into individual records. Thus, factor is used to increase the performance of MakeLines() by splitting very long lines into shorter lines.

" Split file and call function on part files
function! PostTypeFixed(rlen, factor)
	" Recover fixed-length records from the long line in the current buffer.
	"
	" The first line of the buffer is cut into a:factor part files named
	" <n>_<name>.tmp, MakeLines() is called on every part file to append its
	" records to <name>.dat, and finally the number of lines in the output
	" file is compared with the expected number of records.
	"
	" Arguments:
	"   rlen    record length in bytes (must be >= 1)
	"   factor  number of part files to create (must be >= 1)
	"
	" Both file names are derived from the lower-cased name of the current
	" buffer and are created relative to the current working directory.
	if a:rlen < 1 || a:factor < 1
		" Both values are used as divisors below
		echohl ErrorMsg
		echo 'PostTypeFixed: rlen and factor must be positive integers'
		echohl None
		return
	endif

	echo 'Increase factor for better performance'
	let baseName = tolower(expand("%:t:r"))
	" Define temporary file name
	let s:bufName = baseName.'.tmp'
	" Define output file name (script-local: MakeLines() appends to it)
	let s:datOutFile = baseName.'.dat'

	" Refuse to run when the output file would be the input file itself
	let outPath = fnamemodify(s:datOutFile, ':p')
	let inPath = expand('%:p')
	let sameFile = has('fname_case') ? outPath ==# inPath : outPath ==? inPath
	if sameFile
		echohl ErrorMsg
		echo 'PostTypeFixed: output file would overwrite the input file: '.s:datOutFile
		echohl None
		return
	endif

	" Find max record length (byte length of the longest line)
	let s:ll = max(map(range(1, line('$')), "col([v:val, '$'])")) - 1
	let expectedRecords = s:ll / a:rlen
	" Calculate the length of subset records (always a multiple of rlen)
	let s:subsetRecords = (expectedRecords / a:factor) * a:rlen
	let lastRecord = expectedRecords * a:rlen
	if s:ll % a:rlen != 0
		echo 'Warning: '.(s:ll % a:rlen).' trailing bytes do not fill a record and are ignored'
	endif

	" Make list with start/end byte offsets for part files
	let splitStart = []
	let splitEnd = []
	for part in range(a:factor)
		call add(splitStart, s:subsetRecords * part)
		call add(splitEnd, s:subsetRecords * (part + 1))
	endfor
	" The last part takes whatever records are left over
	let splitEnd[-1] = lastRecord

	" Split file into part files; only the first buffer line is used
	let longLine = getline(1)
	let s:set = 0
	while s:set < a:factor
		let subLine = strpart(longLine, splitStart[s:set],
			\ splitEnd[s:set] - splitStart[s:set])
		if writefile([subLine], (s:set + 1).'_'.s:bufName) != 0
			echohl ErrorMsg
			echo 'PostTypeFixed: cannot write part file '.(s:set + 1).'_'.s:bufName
			echohl None
			return
		endif
		let s:set += 1
	endwhile

	" Start from an empty output file: MakeLines() appends to it, so a file
	" left over from an earlier run would otherwise end up in the result
	if filereadable(s:datOutFile)
		call delete(s:datOutFile)
	endif

	" Call MakeLines function on part files
	let s:set = 0
	while s:set < a:factor
		" Open part file; escape the name so spaces etc. survive :edit
		let file = (s:set + 1).'_'.s:bufName
		silent! execute 'e! '.fnameescape(file)
		let status = (s:set + 1).'/'.a:factor
		echo 'Now processing part file: '.status
		call MakeLines(a:rlen)
		let s:set += 1
	endwhile

	" Open output
	silent! execute 'e! '.fnameescape(s:datOutFile)
	" Validate output lines: the count must match exactly (an integer
	" division test would also accept a file with too few lines)
	let val = line('$')
	if val == expectedRecords
		echo 'Output lines ok: '.val.'/'.expectedRecords
	else
		echo 'NB! Error detected! Output line fail'
	endif
	echo 'PostTypeFixed Done'
endfunction

" Insert line feeds and append output to a file
function! MakeLines(rlen)
	" Cut the first line of the current buffer into records of a:rlen bytes
	" and append them, one record per line, to the file named by the
	" script-local variable s:datOutFile (set by PostTypeFixed()).
	"
	" Arguments:
	"   rlen  record length in bytes (must be >= 1)
	"
	" Bytes after the last complete record are not written.
	if a:rlen < 1
		" rlen is used as a divisor below
		echohl ErrorMsg
		echo 'MakeLines: rlen must be a positive integer'
		echohl None
		return
	endif
	if !exists('s:datOutFile')
		echohl ErrorMsg
		echo 'MakeLines: output file is not set, call PostTypeFixed() first'
		echohl None
		return
	endif

	" Calculate total records from the byte length of the longest line
	let lineLength = max(map(range(1, line('$')), "col([v:val, '$'])")) - 1
	let totalRecords = lineLength / a:rlen
	echo 'Records in part file: '.totalRecords

	" Split line into records; fetch the line once, outside the loop
	let text = getline(1)
	let result = []
	for recordIndex in range(totalRecords)
		call add(result, strpart(text, recordIndex * a:rlen, a:rlen))
	endfor

	" Append list to file
	if has('patch-7.4.503')
		" The 'a' flag appends without reading the output back in
		call writefile(result, s:datOutFile, 'a')
	else
		" Older Vim: no append flag, so rewrite existing content plus new
		let existing = filereadable(s:datOutFile) ? readfile(s:datOutFile) : []
		call writefile(existing + result, s:datOutFile)
	endif
endfunction
Spread the love

Leave a Reply

Your email address will not be published.