Partition physical-file -> logical-files in load()
The dlis standard has a concept of logical files. A logical file is a
group of related logical records, i.e. curves and metadata, and is
independent of any other logical file. Each physical file (.dlis) can
contain 1 to n logical files.

load() now returns a tuple-like with one file-handle per logical file
in the physical file.
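A minimal usage sketch of the new return value (the filename is hypothetical; the attribute access mirrors the examples in the updated load() docstring):

>>> import dlisio
>>> # unpack the first logical file, keep any remaining ones in tail
>>> with dlisio.load('example.dlis') as (f, *tail):
...     header = f.fileheader
...     for g in tail:
...         header = g.fileheader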
ErlendHaa authored and jokva committed Jun 2, 2019
1 parent 0bef0d6 commit 3207f16
Showing 6 changed files with 323 additions and 48 deletions.
185 changes: 166 additions & 19 deletions python/dlisio/__init__.py
@@ -12,12 +12,12 @@
pass

class dlis(object):
def __init__(self, stream, explicits, sul_offset = 80):
def __init__(self, stream, explicits, attic, implicits, sul_offset = 80):
self.file = stream
self.explicit_indices = explicits
self.attic = None
self.attic = attic
self.sul_offset = sul_offset
self.fdata_index = None
self.fdata_index = implicits

self.objects = {}
self.object_sets = defaultdict(dict)
@@ -45,6 +45,10 @@ def __enter__(self):
return self

def __exit__(self, type, value, traceback):
print("dlis: closed {}".format(self))
self.close()

def close(self):
self.file.close()

def storage_label(self):
@@ -308,15 +312,63 @@ def open(path):
return core.stream(str(path))

def load(path):
""" Load a file
""" Loads a file and returns one filehandle pr logical file.
The dlis standard have a concept of logical files. A logical file is a
group of related logical records, i.e. curves and metadata and is
independent from any other logical file. Each physical file (.dlis) can
contain 1 to n logical files. Layouts of physical- and logical files:
Physical file::
--------------------------------------------------------
| Logical File 1 | Logical File 2 | ... | Logical File n |
--------------------------------------------------------
Logical File::
---------------------------------------------------------
| Fileheader | Origin | Frame | Channel | curvedata |
---------------------------------------------------------
This means that dlisio.load() will return 1 to n logical files.
Parameters
----------
path : str_like
Examples
--------
Read the fileheader of each logical file
>>> with dlisio.load(filename) as files:
... for f in files:
... header = f.fileheader
Automatically unpack the first logical file and store the remaining logical
files in tail
>>> with dlisio.load(filename) as (f, *tail):
... header = f.fileheader
... for g in tail:
... header = g.fileheader
Notes
-----
1) That the parentezies are needed when unpacking directly in the with
statment
2) The asterisk allows an arbitrary number of extra logical files to be
stored in tail. Use len(tail) to check how many extra logical files there
is
Returns
-------
dlis : dlisio.dlis
dlis : tuple(dlisio.dlis)
"""
path = str(path)

@@ -327,25 +379,120 @@ def load(path):
vrlpos = core.findvrl(mmap, sulpos + 80)

tells, residuals, explicits = core.findoffsets(mmap, vrlpos)
explicits = [i for i, explicit in enumerate(explicits) if explicit != 0]

stream = open(path)
exi = [i for i, explicit in enumerate(explicits) if explicit != 0]

try:
stream = open(path)
stream.reindex(tells, residuals)
f = dlis(stream, explicits, sul_offset = sulpos)

explicits = set(explicits)
candidates = [x for x in range(len(tells)) if x not in explicits]

# TODO: formalise and improve the indexing of FDATA records
index = defaultdict(list)
for key, val in core.findfdata(mmap, candidates, tells, residuals):
index[key].append(val)

f.fdata_index = index
records = stream.extract(exi)
stream.close()
except:
stream.close()
raise

return f
split_at = find_fileheaders(records, exi)

batch = []
for part in partition(records, explicits, tells, residuals, split_at):
try:
stream = open(path)
stream.reindex(part['tells'], part['residuals'])

implicits = defaultdict(list)
for key, val in core.findfdata(mmap,
part['implicits'], part['tells'], part['residuals']):
implicits[key].append(val)

f = dlis(stream, part['explicits'],
part['records'], implicits, sul_offset=sulpos)
batch.append(f)
except:
stream.close()
for stream in batch:
stream.close()
raise

return Batch(batch)

class Batch(tuple):
def __enter__(self):
return self

def __exit__(self, type, value, traceback):
self.close()

def close(self):
for f in self:
f.close()

def find_fileheaders(records, exi):
# Logical files start whenever a FILE-HEADER is encountered. When a logical
# file spans multiple physical files, the FILE-HEADER is not repeated.
# This means that the first record may not be a FILE-HEADER. In that case
# dlisio still creates a logical file, but warns that this logical file
# might be segmented, hence missing data.
msg = 'First logical file does not contain a fileheader. '
msg += 'The logical file might be segmented into multiple physical files '
msg += 'and data can be missing.'

pivots = []

# There are only indirectly formatted logical records in the physical file.
# The logical file might be segmented.
if not records:
pivots.append(0)

for i, rec in enumerate(records):
# The first metadata record is not a file-header. The logical file
# might be segmented.
#TODO: This logic will change when support for multiple physical files
# in a storage set is added
if i == 0 and rec.type != 0:
logging.warning(msg)
pivots.append(exi[i])

if rec.type == 0:
pivots.append(exi[i])

return pivots

def partition(records, explicits, tells, residuals, pivots):
"""
Splits records, explicits, implicits, tells and residuals into
partitions (Logical Files) based on the pivots
Returns
-------
partitions : list(dict)
"""

def split_at(lst, pivot):
head = lst[:pivot]
tail = lst[pivot:]
return head, tail

partitions = []

for pivot in reversed(pivots):
tells , part_tells = split_at(tells, pivot)
residuals, part_res = split_at(residuals, pivot)
explicits, part_ex = split_at(explicits, pivot)

part_ex = [i for i, x in enumerate(part_ex) if x != 0]
implicits = [x for x in range(len(part_tells)) if x not in part_ex]

records, part_recs = split_at(records, -len(part_ex))

part = {
'records' : part_recs,
'explicits' : part_ex,
'tells' : part_tells,
'residuals' : part_res,
'implicits' : implicits
}
partitions.append(part)

for par in reversed(partitions):
yield par
4 changes: 2 additions & 2 deletions python/tests/__init__.py
@@ -4,12 +4,12 @@

@pytest.fixture(scope="module", name="DWL206")
def DWL206():
with dlisio.load('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as f:
with dlisio.load('data/206_05a-_3_DWL_DWL_WIRE_258276498.DLIS') as (f,):
yield f

@pytest.fixture(scope="module", name="only_channels")
def only_channels():
with dlisio.load('data/only-channels.dlis') as f:
with dlisio.load('data/only-channels.dlis') as (f,):
yield f

@pytest.fixture(scope="module")
6 changes: 3 additions & 3 deletions python/tests/test_core.py
@@ -427,17 +427,17 @@ def test_dtype(DWL206):
('DEPT_SL', np.float32)])

def test_load_pre_sul_garbage(only_channels):
with dlisio.load('data/pre-sul-garbage.dlis') as f:
with dlisio.load('data/pre-sul-garbage.dlis') as (f,):
assert f.storage_label() == f.storage_label()
assert f.sul_offset == 12

def test_load_pre_vrl_garbage(only_channels):
with dlisio.load('data/pre-sul-pre-vrl-garbage.dlis') as f:
with dlisio.load('data/pre-sul-pre-vrl-garbage.dlis') as (f,):
assert f.storage_label() == f.storage_label()
assert f.sul_offset == 12

def test_load_file_with_broken_utf8():
with dlisio.load('data/broken-degree-symbol.dlis') as f:
with dlisio.load('data/broken-degree-symbol.dlis') as (f, *tail):
pass

def test_padbytes_as_large_as_record():
128 changes: 128 additions & 0 deletions python/tests/test_load.py
@@ -0,0 +1,128 @@
import pytest

import dlisio

from . import merge_files

@pytest.fixture(scope="module")
def fpath(tmpdir_factory, merge_files):
path = str(tmpdir_factory.mktemp('semantic').join('manylogfiles.dlis'))
content = [
'data/semantic/envelope.dlis.part',
# First logical file, does not have FILE-HEADER
'data/semantic/origin.dlis.part',
'data/semantic/channel.dlis.part',
'data/semantic/frame.dlis.part',
# Second logical file
'data/semantic/file-header.dlis.part',
'data/semantic/origin2.dlis.part',
'data/semantic/channel-reprcode.dlis.part',
'data/semantic/frame-reprcode.dlis.part',
'data/semantic/fdata-reprcode.dlis.part',
'data/semantic/frame.dlis.part',
'data/semantic/fdata-reprcode.dlis.part',
# Third logical file, only has a FILE-HEADER
'data/semantic/file-header2.dlis.part',
]
merge_files(path, content)
return path

def test_context_manager(fpath):
f, *_ = dlisio.load(fpath)
_ = f.fileheader
f.close()

files = dlisio.load(fpath)
for f in files:
_ = f.fileheader
f.close()

f, *files = dlisio.load(fpath)
_ = f.fileheader
for g in files:
_ = g.fileheader
g.close()

def test_context_manager_with(fpath):
with dlisio.load(fpath) as (f, *_):
_ = f.fileheader

with dlisio.load(fpath) as files:
for f in files:
_ = f.fileheader

with dlisio.load(fpath) as (f, *files):
_ = f.fileheader
for g in files:
_ = g.fileheader

def test_partitioning(fpath):
with dlisio.load(fpath) as (f1, f2, f3, *tail):
assert len(tail) == 0

assert len(f1.objects) == 8
assert len(f2.objects) == 32
assert len(f3.objects) == 1

key = dlisio.core.fingerprint('FRAME', 'FRAME-REPRCODE', 10, 0)

assert f1.explicit_indices == [0, 1, 2]
assert not f1.fdata_index

assert f2.explicit_indices == [0, 1, 2, 3, 5]
assert f2.fdata_index[key] == [4, 6]

assert f3.explicit_indices == [0]
assert not f3.fdata_index

def test_objects(fpath):
with dlisio.load(fpath) as (f1, f2, f3):
key = dlisio.core.fingerprint('FILE-HEADER', 'N', 10, 0)
fh2 = f2.objects[key]
key = dlisio.core.fingerprint('FILE-HEADER', 'N', 11, 0)
fh3 = f3.objects[key]

assert len(f1.fileheader) == 0
assert len(f1.origin) == 2
assert len(f1.channels) == 4
assert len(f1.frames) == 2

assert fh2.sequencenr == '8'
assert fh2.id == 'some logical file'
assert fh2 not in f3.fileheader

assert len(f2.origin) == 1
assert len(f2.channels) == 27
assert len(f2.frames) == 3

assert fh3.sequencenr == '10'
assert fh3.id == 'Yet another logical file'
assert fh3 not in f2.fileheader

def test_link(fpath):
with dlisio.load(fpath) as (f1, f2, _):
key = dlisio.core.fingerprint('FRAME', 'FRAME1', 10, 0)
frame1 = f1.objects[key]
frame2 = f2.objects[key]

key = dlisio.core.fingerprint('CHANNEL', 'CHANN1', 10, 0)
channel = f1.objects[key]

# The same frame is present in two different logical files. The
# channels in frame.channel are only present in the first
# logical file. Thus links are not available in the second file.
assert channel in frame1.channels
assert channel not in frame2.channels

def test_curves(fpath):
with dlisio.load(fpath) as (_, f2, _):
key = dlisio.core.fingerprint('FRAME', 'FRAME-REPRCODE', 10, 0)
curves = f2.curves(key)

# Read the first value of the first frame of channel CH01
assert curves['CH01'][0][0] == 153.0
assert curves[0]['CH01'][0] == 153.0

# Read the first value of the second frame of channel CH01
assert curves['CH01'][1][0] == 153.0
assert curves[1]['CH01'][0] == 153.0