OB.DAAC Logo
NASA Logo
Ocean Color Science Software

ocssw V2022
MetaUtils.py
Go to the documentation of this file.
1 """
2 Routines to parse file metadata.
3 
4 """
5 import tarfile
6 
7 import seadasutils.DictUtils as du
8 import os
9 import re
10 import subprocess
11 import sys
12 
def get_hdf4_content(filename):
    """
    Return the header content from an HDF 4 file, which is obtained via
    'hdp dumpsds -h -s'.

    :param filename: path of the HDF 4 file to examine.
    :return: the dump output as a str, or None when the hdp tool cannot
             be found or is not executable.
    """
    # does executable exist?  Guard against an unset LIB3_BIN, for which
    # os.path.join(None, 'hdp') would raise TypeError.
    lib3_bin = os.getenv('LIB3_BIN')
    hdp = os.path.join(lib3_bin, 'hdp') if lib3_bin else 'hdp'
    if not (os.path.isfile(hdp) and os.access(hdp, os.X_OK)):
        print(hdp, "is not executable.")
        return None

    # dump file header; subprocess.run waits for the child and closes the
    # pipe, avoiding the zombie process a bare Popen(...).stdout leaves behind
    cmd = [hdp, 'dumpsds', '-h', '-s', filename]
    result = subprocess.run(cmd, stdout=subprocess.PIPE, shell=False)
    return result.stdout.decode("utf-8")
29 
def get_hdf5_header_plaintext(filename):
    """
    Return the header content plain text from an HDF 5 file, which is
    obtained via 'h5dump -H'.

    :param filename: path of the HDF 5 file to examine.
    :return: the dump output as a str when it looks like HDF output,
             otherwise None.
    """
    # guard against an unset LIB3_BIN (os.path.join(None, ...) raises)
    lib3_bin = os.getenv('LIB3_BIN')
    h5dump = os.path.join(lib3_bin, 'h5dump') if lib3_bin else 'h5dump'
    if not (os.path.isfile(h5dump) and os.access(h5dump, os.X_OK)):
        print(h5dump, "is not executable.")
        return None
    cmd = [h5dump, '-H', filename]
    # run() consumes both pipes, so a chatty stderr cannot fill up and
    # deadlock the child as it could with Popen + an unread stderr=PIPE
    result = subprocess.run(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, shell=False)
    content = result.stdout.decode("utf-8")
    if content.find('HDF') != -1:
        return content
    else:
        return None
47 
def get_hdf5_header_xml(filename):
    """
    Return the header content as XML from an HDF 5 file, which is
    obtained via 'h5dump -A -u'.

    :param filename: path of the HDF 5 file to examine.
    :return: the dump output as a str when it looks like HDF output,
             otherwise None.
    """
    # guard against an unset LIB3_BIN (os.path.join(None, ...) raises)
    lib3_bin = os.getenv('LIB3_BIN')
    h5dump = os.path.join(lib3_bin, 'h5dump') if lib3_bin else 'h5dump'
    if not (os.path.isfile(h5dump) and os.access(h5dump, os.X_OK)):
        print(h5dump, "is not executable.")
        return None

    # dump file header; run() consumes both pipes, so a chatty stderr
    # cannot fill up and deadlock the child as it could with Popen + an
    # unread stderr=PIPE
    cmd = [h5dump, '-A', '-u', filename]
    result = subprocess.run(cmd, stdout=subprocess.PIPE,
                            stderr=subprocess.PIPE, shell=False)
    content = result.stdout.decode("utf-8")
    if content.find('HDF') != -1:
        return content
    else:
        return None
67 
def get_mime_data(filename):
    """
    Return the file-type description for filename, as reported by
    running the 'file --brief' command.
    """
    command = ['file', '--brief', filename]
    output_bytes, _ = subprocess.Popen(command,
                                       stdout=subprocess.PIPE,
                                       shell=False).communicate(), None
    # communicate() returns (stdout, stderr); only stdout was captured
    return output_bytes[0][0].decode("utf-8") if False else \
        subprocess.Popen(command, stdout=subprocess.PIPE,
                         shell=False).communicate()[0].decode("utf-8")
77 
def is_ascii_file(filename):
    """
    Return True when /usr/bin/file classifies the given file as ASCII,
    False otherwise.  Exits with an error message when the file command
    is unavailable.
    """
    file_cmd_path = os.path.join(os.sep, 'usr', 'bin', 'file')
    # guard clause: bail out early when the file command cannot be run
    if not (os.path.exists(file_cmd_path) and os.access(file_cmd_path, os.X_OK)):
        err_msg = 'Error! Unable to run the file command.'
        sys.exit(err_msg)
    proc = subprocess.Popen([file_cmd_path, '--brief', filename],
                            shell=False, stdout=subprocess.PIPE)
    description = proc.stdout.read().decode("utf-8").strip()
    return description.find('ASCII') != -1
95 
def is_hdf4(mime_data):
    """
    Return True when the mime data is from an HDF (version 4) file.
    """
    # bool() so callers get a real boolean, as documented, instead of a
    # re.Match-or-None
    return bool(re.search('Hierarchical.*version.4', mime_data))
101 
def is_netcdf4(mime_data):
    """
    Return True when the mime data is from a netCDF4/HDF 5 file.
    """
    # bool() so callers get a real boolean, as documented, instead of a
    # re.Match-or-None
    return bool(re.search('Hierarchical.*version.5', mime_data))
107 
def is_tar_file(file_path):
    """
    This function is deprecated. Using it is discouraged. Please call
    tarfile.is_tarfile directly.

    Returns a boolean telling if the file is a tar archive file.
    """
    # the old hand-rolled TarFile probe was dead, commented-out code;
    # tarfile.is_tarfile is the supported way to make this check
    return tarfile.is_tarfile(file_path)
123 
def is_metadata_file(mime_data):
    """
    Return a truthy match when the mime data is from xml, a Landsat L1
    file, or an MSI L1C file (manifest.safe); None otherwise.
    """
    match = None
    for pattern in ('xml', 'LC08_L1', 'manifest.safe'):
        match = re.search(pattern, mime_data)
        if match:
            break
    return match
129 
def dump_metadata(filename):
    """Dump file metadata:
    Call functions to get HDF 4 and HDF 5 header data
    read ASCII header from MERIS N1 files

    Depending on the detected file type this returns a str (HDF/netCDF
    dumps), a list of header lines (ASCII headers), an empty list
    (undecodable binary), or None (missing file / missing dump tool).
    """

    # does input file exist?
    if not os.path.isfile(filename):
        print("Can't find input file '" + filename + "'.")
        return None

    lib3_bin_dir = os.getenv('LIB3_BIN')
    if not lib3_bin_dir:
        sys.exit('Error! Unable to locate LIB3_BIN environment variable. You may need to run')
    ncdump = os.path.join(lib3_bin_dir, 'ncdump')
    ncdump_hdf = os.path.join(lib3_bin_dir, 'ncdump_hdf')

    # mimecmd = ['file', '--brief', filename]
    # mime = subprocess.Popen(mimecmd, stdout=subprocess.PIPE).communicate()[0]
    mime = get_mime_data(filename)

    # 'file' reports a bare "data" for some HDF 5 products; try an h5dump
    # header first before falling through to the pattern checks below
    if mime.strip() == 'data':
        content = get_hdf5_header_xml(filename)
        if content:
            return content

    if re.search('Hierarchical.*version.4', mime):
        contents = get_hdf4_content(filename)
        return contents
    elif re.search('Hierarchical.*version.5', mime):
        content = get_hdf5_header_xml(filename)
        return content
    elif re.search('NetCDF Data Format', mime):
        if not (os.path.isfile(ncdump_hdf) and os.access(ncdump_hdf, os.X_OK)):
            print(ncdump_hdf, "is not executable.")
            return None
        cmd = [ncdump_hdf, '-h', filename]
        hdr_content = subprocess.Popen(cmd, shell=False,
                                       stdout=subprocess.PIPE).communicate()
        # communicate() returns (stdout, stderr); return the header split
        # into a list of lines
        return hdr_content[0].decode("utf-8").split('\n')
    else:
        # unrecognized mime type: peek at the first line of the file itself
        fbuffer = open(filename, 'r', 1)
        try:
            line1 = fbuffer.readline()
            fbuffer.close()

            if re.search("HDF_UserBlock", line1):
                # HDF 5 user block present; get the header from h5dump
                content = get_hdf5_header_xml(filename)
                return content
            elif line1[0:3] == 'CDF':
                # For NetCDF files, such as some from MERIS
                cmd = [ncdump, '-h', filename]
                hdr_content = subprocess.Popen(cmd, stdout=subprocess.PIPE, shell=False).stdout
                return hdr_content.read().decode("utf-8")
            else:
                # plain ASCII header (e.g. MERIS N1): collect non-blank
                # lines until the LAST_LAST_LONG marker or EOF
                header = []
                fbuffer = open(filename, 'r', 100)
                #for line in fbuffer.readlines(100):
                line = fbuffer.readline()
                while line:
                    line = line.strip()
                    if len(line):
                        header.append(line)
                    if re.search('LAST_LAST_LONG', line):
                        break
                    line = fbuffer.readline()
                fbuffer.close()
                return header
        except UnicodeDecodeError:
            # binary file opened in text mode: no ASCII header to return
            # NOTE(review): the second fbuffer may be left open on this
            # path — confirm whether that matters to callers
            return []
200 
def readMetadata(filename):
    """
    Returns a dictionary containing the metadata for the file named by
    filename, or None when no known header layout is recognized.
    Exits via sys.exit when dump_metadata yields nothing at all.
    """
    # todo: MERIS N1 files?
    text = dump_metadata(filename)
    # Added text == [] & changed exit() to sys.exit() -Matt, Feb. 15, 2012
    # Kept an exit here (instead of making it a return) as already
    # existing programs assume the output from this function is good.
    if text is None or text == '':
        sys.exit("Error! dump_metadata failed.")

    attrs = None

    # extract meaningful parts
    if isinstance(text, list):
        # dump_metadata returned a list of header lines; dispatch on the
        # first line to pick a parser
        if text == []:
            return attrs
        elif re.search('SENTINEL-2 MSI Level-1C User Product', text[0]):
            # Sentinel-2 MSI L1C: platform and start time from first line
            attrs = {}
            if re.search('2A', text[0]):
                attrs['platform'] = 'S2A'
            elif re.search('2B', text[0]):
                attrs['platform'] = 'S2B'
            if text[0].find('startTime') != -1:
                line_parts = text[0].split('safe:startTime>')
                line_parts2 = line_parts[1].split('Z<')
                attrs['startTime'] = line_parts2[0].strip()
            attrs['instrument'] = 'MSI'
            attrs['processing_level'] = 'L1B'
            return attrs
        elif re.search('PRODUCT', text[0]):
            # simple key=value header lines
            attrs = {}
            for line in text:
                (key, value) = str(line).split('=')
                attrs[key] = str(value).strip('"')
            return attrs
        elif text[0][0:4] == 'CWIF':
            return {'Title': 'SeaWiFS Level-0'}
        elif text[0].find('GROUP = L1_METADATA_FILE') != -1:
            # Landsat MTL (pre-collection-2): keep only the entries inside
            # the PRODUCT_METADATA group
            in_metadata_group = False
            attrs = {}
            for line in text:
                if in_metadata_group:
                    if line.find('END_GROUP = PRODUCT_METADATA') != -1:
                        break
                    else:
                        line_parts = line.split('=')
                        attr_key = line_parts[0].strip()
                        attr_val = line_parts[1].strip()
                        attrs[attr_key] = attr_val
                elif line.find('GROUP = PRODUCT_METADATA') != -1:
                    in_metadata_group = True
        elif text[0].find('GROUP = LANDSAT_METADATA_FILE') != -1:
            # Landsat collection 2 MTL: keep entries from PRODUCT_CONTENTS
            # up to LEVEL1_PROCESSING_RECORD
            in_metadata_group = False
            attrs = {}
            for line in text:
                if in_metadata_group:
                    if line.find('END_GROUP = LEVEL1_PROCESSING_RECORD') != -1:
                        break
                    else:
                        line_parts = line.split('=')
                        attr_key = line_parts[0].strip()
                        attr_val = line_parts[1].strip()
                        attrs[attr_key] = attr_val
                elif line.find('GROUP = PRODUCT_CONTENTS') != -1:
                    in_metadata_group = True
        elif text[0].find(' = INVENTORYMETADATA') != -1:
            # ODL inventory metadata: extract instrument, start date and
            # start time from the SENSORSHORTNAME / CALENDARDATE /
            # TIMEOFDAY objects' VALUE lines
            # in_metadata_group = False
            in_sensor_group = False
            in_date_group = False
            in_time_group = False
            attrs = {}
            for line in text:
                if in_sensor_group and line.find('VALUE') != -1:
                    line_parts = line.split('=')
                    attr_val = line_parts[1].strip().replace('"', '')
                    attrs['instrument'] = attr_val
                    # sensor name is the last item gathered; stop scanning
                    break
                elif in_date_group and line.find('VALUE') != -1:
                    line_parts = line.split('=')
                    attr_val = line_parts[1].strip().replace('"', '')
                    attrs['startDate'] = attr_val
                    in_date_group = False
                elif in_time_group and line.find('VALUE') != -1:
                    line_parts = line.split('=')
                    attr_val = line_parts[1].strip().replace('"', '')
                    attrs['startTime'] = attr_val
                    in_time_group = False
                elif line.find('SENSORSHORTNAME') != -1 and line.find('END_OBJECT') == -1:
                    in_sensor_group = True
                elif line.find('TIMEOFDAY') != -1 and line.find('END_OBJECT') == -1:
                    in_time_group = True
                elif line.find('CALENDARDATE') != -1 and line.find('END_OBJECT') == -1:
                    in_date_group = True
        elif text[0].find('xml') != -1:
            # XML manifests: Sentinel-2 MSI, Envisat MERIS, Sentinel-3 OLCI
            attrs = {}
            for line in text:
                if line.find('SENTINEL-2 MSI Level-1C User Product') != -1:
                    attrs['instrument'] = 'MSI'
                    attrs['processing_level'] = 'L1B'
                if line.find('safe:startTime>') != -1:
                    line_parts = line.split('safe:startTime>')
                    line_parts2 = line_parts[1].split('<')
                    attrs['startTime'] = line_parts2[0].strip()
                if line.find('stopTime') != -1:
                    line_parts = line.split('>')
                    line_parts2 = line_parts[1].split('<')
                    attrs['stopTime'] = line_parts2[0].strip()
                if line.find('<envisat:productName>ENV_ME_1_') != -1:
                    attrs['platform'] = 'ENVISAT'
                    attrs['instrument'] = 'MERIS'
                    attrs['processing_level'] = 'L1B'
                    return attrs
                if line.find('<sentinel3:productName>S3A_OL_1_ERR') != -1:
                    attrs['platform'] = '3A'
                    attrs['data_type'] = 'ERR'
                    attrs['instrument'] = 'OLCI'
                    attrs['processing_level'] = 'L1B'
                    return attrs
                if line.find('<sentinel3:productName>S3A_OL_1_EFR') != -1:
                    attrs['platform'] = '3A'
                    attrs['data_type'] = 'EFR'
                    attrs['instrument'] = 'OLCI'
                    attrs['processing_level'] = 'L1B'
                    return attrs
                if line.find('<sentinel3:productName>S3B_OL_1_ERR') != -1:
                    attrs['platform'] = '3B'
                    attrs['data_type'] = 'ERR'
                    attrs['instrument'] = 'OLCI'
                    attrs['processing_level'] = 'L1B'
                    return attrs
                if line.find('<sentinel3:productName>S3B_OL_1_EFR') != -1:
                    attrs['platform'] = '3B'
                    attrs['data_type'] = 'EFR'
                    attrs['instrument'] = 'OLCI'
                    attrs['processing_level'] = 'L1B'
                    return attrs
                # NOTE(review): attrs['instrument'] raises KeyError when no
                # MSI line was seen before a '2A'/'2B' line — confirm the
                # manifests guarantee ordering
                if line.find('2A')!= -1 and attrs['instrument'] == 'MSI':
                    attrs['platform'] = 'S2A'
                    return attrs
                if line.find('2B')!= -1 and attrs['instrument'] == 'MSI':
                    attrs['platform'] = 'S2B'
                    return attrs
        else:
            for line in text:
                if line.find('title = ') != -1:
                    if line.find('Daily-OI') != -1:
                        # NOAA supplied SST Ancillary files
                        return {'Title': 'Ancillary', 'Data Type': 'SST'}
    elif isinstance(text, bytes) and (text[0:6] == 'netcdf'):
        # NOTE(review): dump_metadata appears to return str/list, so these
        # bytes branches look unreachable as written (bytes never compare
        # equal to a str slice) — confirm before relying on them
        attrs = {}
        lines = text.split('\n')
        for line in lines:
            if line.find('=') != -1:
                fields = line.split('=')
                key = fields[0]
                pos = 0
                # strip leading non-alphabetic characters from the key
                while (not fields[0][pos].isalpha()) and pos < len(fields[0]):
                    key = key[1:]
                    pos += 1
                attrs[key.strip()] = fields[1].strip()
        return attrs
    elif isinstance(text, bytes) and (text[0:4] == 'HDF5'):
        attrs = get_hdf5_attr(text)
        return attrs
    # elif isinstance(text, types.StringType) and text[0:4] == 'HDF5':
    #     attrs = get_hdf5_attr(text)
    elif re.search(r'<\?xml', text) or (text[0:4] == 'HDF5'):
        # if hdf5 file
        attrs = get_xml_attr(text)
    else:
        # if hdf4 file
        file_attr_re = re.compile('File attributes:(.+?)\n',
                                  re.MULTILINE | re.DOTALL)
        file_attr_results = file_attr_re.search(text)
        if file_attr_results != None:
            file_attr_var_re = re.compile('File attributes:(.+?)\nVariable',
                                          re.MULTILINE | re.DOTALL)
            file_attr_var_results = file_attr_var_re.search(text)
            if file_attr_var_results != None:
                allmeta = file_attr_var_results.group(1)
                # remove spaces around "=" to speed future searches
                allmeta = re.sub(r'\s*=\s*', '=', allmeta)
                # parse each file attribute
                attrs = get_odl_attr(allmeta)
        else:
            attrs = \
                get_attr(text)
    return attrs
391 
def get_attr(text):
    """
    Parse 'AttrN: Name = ...' / 'Value = ...' line pairs out of text.

    :param text: Text containing metadata to be parsed.
    :return: A dictionary containing metadata attributes.
    """
    attrs = {}
    name_re = re.compile(r'^\s*Attr\d+: Name = ')
    value_re = re.compile(r'^\s*Value = ')
    current_name = None
    for line in text.split('\n'):
        if name_re.match(line):
            current_name = line.split('=')[1].strip()
        elif current_name is not None and value_re.match(line):
            value = line.split('=', 1)[1].strip()
            if current_name == 'Input Parameters':
                # this attribute is itself a |-separated key=value list
                nested = {}
                for param in value.split('|'):
                    pieces = param.split('=')
                    if len(pieces) == 2:
                        nested[pieces[0].strip()] = pieces[1].strip()
                attrs[current_name] = nested
            else:
                attrs[current_name] = value
    return attrs
419 
def get_hdf5_attr(header_text):
    """
    Return a Python dictionary containing the file metadata passed from
    header_text (h5dump-style output).  The dictionary keys are the
    attribute names and the values are the data values for the
    attributes.
    """
    attributes = {}
    attr_regex = re.compile(r'ATTRIBUTE "')
    data_open_regex = re.compile(r'DATA \{')
    close_regex = re.compile(r' \}')
    in_attr = False
    in_data = False
    for line in header_text.split('\n'):
        if attr_regex.search(line):
            # start of a new attribute block; default its value to ''
            in_attr = True
            attr_name = re.search(r'ATTRIBUTE "(.+)"', line).group(1)
            attributes[attr_name] = ''
        elif data_open_regex.search(line):
            in_data = True
        elif in_data:
            if close_regex.search(line):
                in_data = False
            elif re.search(r'\(\d+\)\:', line):
                # Because the data fields can start or end with extra spaces
                # both inside and outside the quotation marks, there are
                # multiple calls to .strip().
                the_data = line.split(':')[1].strip().strip('"').strip()
                attributes[attr_name] = the_data
        elif in_attr and close_regex.search(line):
            in_attr = False
    return attributes
453 
def get_odl_attr(metatext):
    """
    Get the interesting bits (Name/Type/Count/Value attribute records)
    from ODL formatted metadata and return them as a dictionary.
    """
    attrs = {}
    pattern = r'^\s*Attr\d+: Name=(.+?)\s*Type=(.+?)\s*Count=(.+?)\s*Value=(.+?)$'
    re_attr = re.compile(pattern, re.MULTILINE | re.DOTALL)

    for att in re_attr.finditer(metatext):
        name, dtype, count, value = att.groups()

        if 'char' in dtype:
            # interpret ASCII codes
            value = re.sub(r'\\000', '', value)    # null
            value = re.sub(r'\\011', '\t', value)  # horizontal tab
            value = re.sub(r'\\012', '\n', value)  # newline
        else:
            # add commas between array elements so they'll evaluate correctly;
            # int() instead of eval() here — the Count field is a plain
            # integer, and eval on file-derived text is unsafe
            if int(count) > 1:
                value = ','.join(value.split())
            # evaluate string to numerical type
            value = set_type(value)

        if 'Metadata.' in name:
            # interpret ODL hierarchy
            value = parse_odl(value)

        # add attribute to dictionary
        attrs[name] = value

    # eliminate redundant info, then return dictionary.
    prune_odl(attrs)
    return attrs
488 
def add_xml_group(group, attr):
    """
    Add xml attribute values found under group to the attr dictionary,
    descending recursively into Group and Dataset nodes.
    """
    for node in group:
        if node.tag == 'Attribute':
            try:
                key = node.attrib['Name']
                val = node.find('Data').find('DataFromFile').text.strip().strip('"')
                attr[key] = val
            except (AttributeError, KeyError):
                # Attribute without a Name (KeyError) or without
                # Data/DataFromFile text (AttributeError on None): skip it
                # rather than abort the whole parse.  The previous bare
                # except also hid unrelated errors.
                pass
        elif node.tag == 'Group' or node.tag == 'Dataset':
            add_xml_group(node, attr)
503 
504 
def get_xml_attr(metaxml):
    """
    Parse xml formatted metadata (h5dump -Au output) into a dictionary
    of attribute name -> value.
    """
    import xml.etree.ElementTree as ET

    attributes = {}
    root_group = ET.fromstring(metaxml).find('RootGroup')
    add_xml_group(root_group, attributes)
    return attributes
515 
def parse_odl(text):
    """Recursively extract ODL GROUP/OBJECT blocks into nested dicts."""

    # descend into GROUP/OBJECT hierarchy
    pattern = r"(GROUP|OBJECT)=(.+?)$(.+?)END_\1=\2"
    re_odl = re.compile(pattern, re.MULTILINE | re.DOTALL)
    items = {}
    for _, block_name, block_body in re_odl.findall(text):
        items[block_name] = parse_odl(block_body)

    # no sub-blocks found: interpret value(s) at this innermost level
    if not items:
        for line in text.splitlines():
            get_value(line, items)

    return items
535 
def get_value(text, items=None):
    """Interpret text as a 'key = value' pair, if possible, storing the
    typed value in items (a new dict when not supplied)."""
    if items is None:
        items = {}
    pieces = text.split('=', 1)
    # only lines with an '=' separator are key/value pairs
    if len(pieces) == 2:
        key = pieces[0].strip()
        items[key] = set_type(pieces[1].strip())
    return items
546 
547 
def set_type(value):
    """Parse string value into correct type.

    NOTE(review): eval() on file-derived metadata will execute arbitrary
    Python expressions; only trusted input should reach this function.
    """
    try:
        return eval(value)
    except (NameError, SyntaxError, TypeError):
        return value  # leave unchanged anything that can't be evaluated
554 
555 
def prune_odl(metadict):
    """Remove redundant/structural entries from an ODL metadata dict,
    in place, via the DictUtils helpers (see seadasutils.DictUtils for
    the exact semantics of each call)."""
    # drop HDF-EOS structural metadata and bookkeeping keys
    du.delete_key(metadict, 'StructMetadata.[0-9]')
    du.delete_key(metadict, '(NUM_VAL|CLASS)')
    # presumably collapses '...VALUE' wrappers onto their parent keys —
    # verify against DictUtils.promote_value
    du.promote_value(metadict, '.*VALUE')
    du.reassign_keys_in_dict(metadict,
                             'ADDITIONALATTRIBUTENAME', 'INFORMATIONCONTENT')
    du.flatten_dict(metadict)
    return
def add_xml_group(group, attr)
Definition: MetaUtils.py:489
list(APPEND LIBS ${PGSTK_LIBRARIES}) add_executable(atteph_info_modis atteph_info_modis.c) target_link_libraries(atteph_info_modis $
Definition: CMakeLists.txt:7
def is_metadata_file(mime_data)
Definition: MetaUtils.py:124
def is_ascii_file(filename)
Definition: MetaUtils.py:78
def get_value(text, items=None)
Definition: MetaUtils.py:536
def set_type(value)
Definition: MetaUtils.py:548
def get_xml_attr(metaxml)
Definition: MetaUtils.py:505
def readMetadata(filename)
Definition: MetaUtils.py:201
def prune_odl(metadict)
Definition: MetaUtils.py:556
def is_tar_file(file_path)
Definition: MetaUtils.py:108
def get_hdf4_content(filename)
Definition: MetaUtils.py:13
def get_hdf5_header_xml(filename)
Definition: MetaUtils.py:48
const char * str
Definition: l1c_msi.cpp:35
def is_hdf4(mime_data)
Definition: MetaUtils.py:96
def get_hdf5_attr(header_text)
Definition: MetaUtils.py:420
def get_hdf5_header_plaintext(filename)
Definition: MetaUtils.py:30
#define isalpha(c)
def dump_metadata(filename)
Definition: MetaUtils.py:130
def is_netcdf4(mime_data)
Definition: MetaUtils.py:102
def get_mime_data(filename)
Definition: MetaUtils.py:68
def get_odl_attr(metatext)
Definition: MetaUtils.py:454