2 Routines to parse file metadata.
15 Returns the header content from an HDF 4 file, which is obtained via
19 hdp = os.path.join(os.getenv(
'LIB3_BIN'),
'hdp')
20 if not (os.path.isfile(hdp)
and os.access(hdp, os.X_OK)):
21 print(hdp,
"is not executable.")
25 cmd = [hdp,
'dumpsds',
'-h',
'-s', filename]
26 hdp_data = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=
False).stdout
27 contents = hdp_data.read().decode(
"utf-8")
32 Returns the header content as plain text from an HDF 5 file, which is obtained via
35 h5dump = os.path.join(os.getenv(
'LIB3_BIN'),
'h5dump')
36 if not (os.path.isfile(h5dump)
and os.access(h5dump, os.X_OK)):
37 print(h5dump,
"is not executable.")
39 cmd = [h5dump,
'-H', filename]
40 h5dump_output = subprocess.Popen(cmd, stdout=subprocess.PIPE,
41 stderr=subprocess.PIPE, shell=
False).stdout
42 content = h5dump_output.read().decode(
"utf-8")
43 if content.find(
'HDF') != -1:
50 Returns the header content as XML from an HDF 5 file, which is obtained via
53 h5dump = os.path.join(os.getenv(
'LIB3_BIN'),
'h5dump')
54 if not (os.path.isfile(h5dump)
and os.access(h5dump, os.X_OK)):
55 print(h5dump,
"is not executable.")
59 cmd = [h5dump,
'-A',
'-u', filename]
60 h5dump_output = subprocess.Popen(cmd, stdout=subprocess.PIPE,
61 stderr=subprocess.PIPE, shell=
False).stdout
62 content = h5dump_output.read().decode(
"utf-8")
63 if content.find(
'HDF') != -1:
70 Returns the mime data for the file named in filename as found by running
73 mimecmd = [
'file',
'--brief', filename]
74 mime_data = subprocess.Popen(mimecmd,
75 stdout=subprocess.PIPE, shell=
False).communicate()[0]
76 return mime_data.decode(
"utf-8")
80 Returns True if the given file is an ASCII file, False otherwise.
82 file_cmd_path = os.path.join(os.sep,
'usr',
'bin',
'file')
83 if os.path.exists(file_cmd_path)
and os.access(file_cmd_path, os.X_OK):
84 file_cmd = [file_cmd_path,
'--brief', filename]
85 file_output = subprocess.Popen(file_cmd, shell=
False,
86 stdout=subprocess.PIPE).stdout
87 file_type = file_output.read().decode(
"utf-8").strip()
88 if file_type.find(
'ASCII') != -1:
93 err_msg =
'Error! Unable to run the file command.'
98 Return True when the mime data is from an HDF 4 file.
100 return re.search(
'Hierarchical.*version.4', mime_data)
104 Return True when the mime data is from netCDF4/HDF 5 file.
106 return re.search(
'Hierarchical.*version.5', mime_data)
110 This function is deprecated. Using it is discouraged. Please call
111 tarfile.is_tarfile directly.
113 Returns a boolean telling if the file is a tar archive file.
122 return tarfile.is_tarfile(file_path)
126 Return True when the mime data is from xml, Landsat L1 file or MSI L1C file.
128 return re.search(
'xml', mime_data)
or re.search(
'LC08_L1', mime_data)
or re.search(
'manifest.safe', mime_data)
131 """Dump file metadata:
132 Call functions to get HDF 4 and HDF 5 header data
133 read ASCII header from MERIS N1 files
137 if not os.path.isfile(filename):
138 print(
"Can't find input file '" + filename +
"'.")
141 lib3_bin_dir = os.getenv(
'LIB3_BIN')
143 sys.exit(
'Error! Unable to locate LIB3_BIN environment variable. You may need to run')
144 ncdump = os.path.join(lib3_bin_dir,
'ncdump')
145 ncdump_hdf = os.path.join(lib3_bin_dir,
'ncdump_hdf')
151 if mime.strip() ==
'data':
156 if re.search(
'Hierarchical.*version.4', mime):
159 elif re.search(
'Hierarchical.*version.5', mime):
162 elif re.search(
'NetCDF Data Format', mime):
163 if not (os.path.isfile(ncdump_hdf)
and os.access(ncdump_hdf, os.X_OK)):
164 print(ncdump_hdf,
"is not executable.")
166 cmd = [ncdump_hdf,
'-h', filename]
167 hdr_content = subprocess.Popen(cmd, shell=
False,
168 stdout=subprocess.PIPE).communicate()
169 return hdr_content[0].decode(
"utf-8").split(
'\n')
171 fbuffer = open(filename,
'r', 1)
173 line1 = fbuffer.readline()
176 if re.search(
"HDF_UserBlock", line1):
179 elif line1[0:3] ==
'CDF':
181 cmd = [ncdump,
'-h', filename]
182 hdr_content = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=
False).stdout
183 return hdr_content.read().decode(
"utf-8")
186 fbuffer = open(filename,
'r', 100)
188 line = fbuffer.readline()
193 if re.search(
'LAST_LAST_LONG', line):
195 line = fbuffer.readline()
198 except UnicodeDecodeError:
203 Returns a dictionary containing the metadata for the file named by filename.
210 if text
is None or text ==
'':
211 sys.exit(
"Error! dump_metadata failed.")
216 if isinstance(text, list):
219 elif re.search(
'SENTINEL-2 MSI Level-1C User Product', text[0]):
221 if re.search(
'2A', text[0]):
222 attrs[
'platform'] =
'S2A'
223 elif re.search(
'2B', text[0]):
224 attrs[
'platform'] =
'S2B'
225 if text[0].find(
'startTime') != -1:
226 line_parts = text[0].split(
'safe:startTime>')
227 line_parts2 = line_parts[1].split(
'Z<')
228 attrs[
'startTime'] = line_parts2[0].strip()
229 attrs[
'instrument'] =
'MSI'
230 attrs[
'processing_level'] =
'L1B'
232 elif re.search(
'PRODUCT', text[0]):
235 (key, value) =
str(line).split(
'=')
236 attrs[key] =
str(value).strip(
'"')
238 elif text[0][0:4] ==
'CWIF':
239 return {
'Title':
'SeaWiFS Level-0'}
240 elif text[0].find(
'GROUP = L1_METADATA_FILE') != -1:
241 in_metadata_group =
False
244 if in_metadata_group:
245 if line.find(
'END_GROUP = PRODUCT_METADATA') != -1:
248 line_parts = line.split(
'=')
249 attr_key = line_parts[0].strip()
250 attr_val = line_parts[1].strip()
251 attrs[attr_key] = attr_val
252 elif line.find(
'GROUP = PRODUCT_METADATA') != -1:
253 in_metadata_group =
True
254 elif text[0].find(
'GROUP = LANDSAT_METADATA_FILE') != -1:
255 in_metadata_group =
False
258 if in_metadata_group:
259 if line.find(
'END_GROUP = LEVEL1_PROCESSING_RECORD') != -1:
262 line_parts = line.split(
'=')
263 attr_key = line_parts[0].strip()
264 attr_val = line_parts[1].strip()
265 attrs[attr_key] = attr_val
266 elif line.find(
'GROUP = PRODUCT_CONTENTS') != -1:
267 in_metadata_group =
True
268 elif text[0].find(
' = INVENTORYMETADATA') != -1:
270 in_sensor_group =
False
271 in_date_group =
False
272 in_time_group =
False
275 if in_sensor_group
and line.find(
'VALUE') != -1:
276 line_parts = line.split(
'=')
277 attr_val = line_parts[1].strip().replace(
'"',
'')
278 attrs[
'instrument'] = attr_val
280 elif in_date_group
and line.find(
'VALUE') != -1:
281 line_parts = line.split(
'=')
282 attr_val = line_parts[1].strip().replace(
'"',
'')
283 attrs[
'startDate'] = attr_val
284 in_date_group =
False
285 elif in_time_group
and line.find(
'VALUE') != -1:
286 line_parts = line.split(
'=')
287 attr_val = line_parts[1].strip().replace(
'"',
'')
288 attrs[
'startTime'] = attr_val
289 in_time_group =
False
290 elif line.find(
'SENSORSHORTNAME') != -1
and line.find(
'END_OBJECT') == -1:
291 in_sensor_group =
True
292 elif line.find(
'TIMEOFDAY') != -1
and line.find(
'END_OBJECT') == -1:
294 elif line.find(
'CALENDARDATE') != -1
and line.find(
'END_OBJECT') == -1:
296 elif text[0].find(
'xml') != -1:
299 if line.find(
'SENTINEL-2 MSI Level-1C User Product') != -1:
300 attrs[
'instrument'] =
'MSI'
301 attrs[
'processing_level'] =
'L1B'
302 if line.find(
'safe:startTime>') != -1:
303 line_parts = line.split(
'safe:startTime>')
304 line_parts2 = line_parts[1].split(
'<')
305 attrs[
'startTime'] = line_parts2[0].strip()
306 if line.find(
'stopTime') != -1:
307 line_parts = line.split(
'>')
308 line_parts2 = line_parts[1].split(
'<')
309 attrs[
'stopTime'] = line_parts2[0].strip()
310 if line.find(
'<envisat:productName>ENV_ME_1_') != -1:
311 attrs[
'platform'] =
'ENVISAT'
312 attrs[
'instrument'] =
'MERIS'
313 attrs[
'processing_level'] =
'L1B'
315 if line.find(
'<sentinel3:productName>S3A_OL_1_ERR') != -1:
316 attrs[
'platform'] =
'3A'
317 attrs[
'data_type'] =
'ERR'
318 attrs[
'instrument'] =
'OLCI'
319 attrs[
'processing_level'] =
'L1B'
321 if line.find(
'<sentinel3:productName>S3A_OL_1_EFR') != -1:
322 attrs[
'platform'] =
'3A'
323 attrs[
'data_type'] =
'EFR'
324 attrs[
'instrument'] =
'OLCI'
325 attrs[
'processing_level'] =
'L1B'
327 if line.find(
'<sentinel3:productName>S3B_OL_1_ERR') != -1:
328 attrs[
'platform'] =
'3B'
329 attrs[
'data_type'] =
'ERR'
330 attrs[
'instrument'] =
'OLCI'
331 attrs[
'processing_level'] =
'L1B'
333 if line.find(
'<sentinel3:productName>S3B_OL_1_EFR') != -1:
334 attrs[
'platform'] =
'3B'
335 attrs[
'data_type'] =
'EFR'
336 attrs[
'instrument'] =
'OLCI'
337 attrs[
'processing_level'] =
'L1B'
339 if line.find(
'2A')!= -1
and attrs[
'instrument'] ==
'MSI':
340 attrs[
'platform'] =
'S2A'
342 if line.find(
'2B')!= -1
and attrs[
'instrument'] ==
'MSI':
343 attrs[
'platform'] =
'S2B'
347 if line.find(
'title = ') != -1:
348 if line.find(
'Daily-OI') != -1:
350 return {
'Title':
'Ancillary',
'Data Type':
'SST'}
351 elif isinstance(text, bytes)
and (text[0:6] ==
'netcdf'):
353 lines = text.split(
'\n')
355 if line.find(
'=') != -1:
356 fields = line.split(
'=')
359 while (
not fields[0][pos].
isalpha())
and pos < len(fields[0]):
362 attrs[key.strip()] = fields[1].strip()
364 elif isinstance(text, bytes)
and (text[0:4] ==
'HDF5'):
369 elif re.search(
r'<\?xml', text)
or (text[0:4] ==
'HDF5'):
374 file_attr_re = re.compile(
'File attributes:(.+?)\n',
375 re.MULTILINE | re.DOTALL)
376 file_attr_results = file_attr_re.search(text)
377 if file_attr_results !=
None:
378 file_attr_var_re = re.compile(
'File attributes:(.+?)\nVariable',
379 re.MULTILINE | re.DOTALL)
380 file_attr_var_results = file_attr_var_re.search(text)
381 if file_attr_var_results !=
None:
382 allmeta = file_attr_var_results.group(1)
384 allmeta = re.sub(
r'\s*=\s*',
'=', allmeta)
394 :param text: Text containing metadata to be parsed.
395 :return: A dictionary containing metadata attributes.
398 lines = text.split(
'\n')
399 attr_pattern = re.compile(
r'^\s*Attr\d+: Name = ')
400 value_pattern = re.compile(
r'^\s*Value = ')
403 if re.match(attr_pattern, line):
405 attr_name = line.split(
'=')[1].strip()
407 if re.match(value_pattern, line):
408 val =
str(line).split(
'=', 1)[1].strip()
409 if attr_name ==
'Input Parameters':
410 attrs[attr_name] = {}
411 params = val.split(
'|')
413 parts = param.split(
'=')
415 attrs[attr_name][parts[0].strip()] = parts[1].strip()
417 attrs[attr_name] = val
421 """ Returns a Python dictionary containing the file metadata passed from
422 header_text. The dictionary keys will be the attribute names and the values
423 will be the data values for the attributes. """
425 attr_regex = re.compile(
r'ATTRIBUTE "')
426 data_item_regex = re.compile(
r'\(\d+(,\d+)?\): ".+"')
427 data_open_regex = re.compile(
r'DATA \{')
428 close_regex = re.compile(
r' \}')
429 data_lines = header_text.split(
'\n')
432 for line
in data_lines:
433 if attr_regex.search(line):
435 attr_name = re.search(
r'ATTRIBUTE "(.+)"', line).
group(1)
436 attributes[attr_name] =
''
437 elif data_open_regex.search(line):
440 if close_regex.search(line):
443 elif re.search(
r'\(\d+\)\:', line):
448 the_data = line.split(
':')[1].strip().strip(
'"').strip()
449 attributes[attr_name] = the_data
450 elif in_attr
and close_regex.search(line):
456 get interesting bits from ODL formatted metadata
459 pattern =
r'^\s*Attr\d+: Name=(.+?)\s*Type=(.+?)\s*Count=(.+?)\s*Value=(.+?)$'
460 re_attr = re.compile(pattern, re.MULTILINE | re.DOTALL)
462 for att
in re_attr.finditer(metatext):
463 name, dtype, count, value = att.groups()
467 value = re.sub(
r'\\000',
'', value)
468 value = re.sub(
r'\\011',
'\t', value)
469 value = re.sub(
r'\\012',
'\n', value)
474 value =
','.join(value.split())
478 if 'Metadata.' in name:
491 add XML attributes to attr and descend groups
494 if node.tag ==
'Attribute':
496 key = node.attrib[
'Name']
497 val = node.find(
'Data').find(
'DataFromFile').text.strip().strip(
'"')
501 elif node.tag ==
'Group' or node.tag ==
'Dataset':
507 parse xml formatted metadata
509 import xml.etree.ElementTree
as ET
512 root = ET.fromstring(metaxml).find(
'RootGroup')
517 """Recursively extract ODL groups and objects."""
520 pattern =
r"(GROUP|OBJECT)=(.+?)$(.+?)END_\1=\2"
521 re_odl = re.compile(pattern, re.MULTILINE | re.DOTALL)
523 blocks = re_odl.findall(text)
530 if not len(
list(items.keys())):
531 for line
in text.splitlines():
537 """Interpret text as key/value pairs, if possible."""
541 key, value = [i.strip()
for i
in text.split(
'=', 1)]
549 """Parse string value into correct type"""
552 except (NameError, SyntaxError, TypeError):
557 du.delete_key(metadict,
'StructMetadata.[0-9]')
558 du.delete_key(metadict,
'(NUM_VAL|CLASS)')
559 du.promote_value(metadict,
'.*VALUE')
560 du.reassign_keys_in_dict(metadict,
561 'ADDITIONALATTRIBUTENAME',
'INFORMATIONCONTENT')
562 du.flatten_dict(metadict)