Ocean Color Science Software
ocssw V2022
SessionUtils.py
import os
import sys
import time
import re
import requests

python2 = sys.version_info.major < 3

# URL parsing utils:

if python2:
    from urlparse import urljoin, urlsplit, urlunsplit
else:  # python 3
    from urllib.parse import urljoin, urlsplit, urlunsplit


def base_url(url):
    parts = urlsplit(url)
    return urlunsplit((parts.scheme, parts.netloc, parts.path, None, None))


def full_url(url, link):
    """
    Add query to urljoin() results
    ONLY if it's a page
    """
    base = base_url(urljoin(url, link))
    if not is_page(base):
        return base
    else:
        scheme, netloc, path, query, fragment = urlsplit(base)
        query = urlsplit(url).query
        return urlunsplit((scheme, netloc, path, query, None))


def is_page(url):
    """
    Make the dangerous assumption that URLs
    pointing to another web page always end in '/'.
    """
    return base_url(url).endswith('/')
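
# Illustrative behavior of the helpers above (hypothetical URLs, not from the
# original source): with url = 'https://example.gov/files/?format=json',
#   full_url(url, 'sub/')    -> 'https://example.gov/files/sub/?format=json'
#   full_url(url, 'data.nc') -> 'https://example.gov/files/data.nc'
# i.e. the query string is re-attached only when the joined link is_page().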


# General utils:

def retry(func, *args, **kwargs):
    """
    Retry specified function call after a short delay
    """
    ntries = kwargs.get('ntries')
    if ntries:
        delay = int(5 + (30. * (1. / (float(ntries) + 1.))))
        if kwargs.get('verbose'):
            print('Sleeping {}s; {} tries left.'.format(delay, ntries - 1))
        time.sleep(delay)
        kwargs['ntries'] = ntries - 1
    return func(*args, **kwargs)
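
# Illustrative use of retry() (hypothetical function name): a call such as
#   retry(fetch, url, ntries=3, verbose=True)
# sleeps ~12 s, prints "2 tries left", then calls fetch(url, ntries=2,
# verbose=True); note that the wrapped function must accept the ntries
# (and verbose) keywords, since retry() passes them along.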


def thiscall():
    """
    Get function and arguments for caller
    """
    import inspect
    caller = inspect.stack()[1]
    func = eval(caller[3])  # function object; name must resolve in this module
    args = inspect.getargvalues(caller[0])  # caller's frame
    values = [args.locals[arg] for arg in args.args]
    arglist = dict(zip(args.args, values))  # all as keyword args
    return func, arglist


def set_mtime(filepath, mtime):
    """
    Set modification time for specified file.
    Set access time to "now".
    """
    atime = time.time()
    if python2:
        os.utime(filepath, (atime, mtime))
    else:
        os.utime(filepath, times=(atime, mtime))


# URL content parsing utils:

def getlinks_html(content, regex=''):
    if python2:
        from BeautifulSoup import BeautifulSoup, SoupStrainer
        soup = BeautifulSoup(content, parseOnlyThese=SoupStrainer('a'))
        linklist = soup.findAll('a', attrs={'href': re.compile(regex)})
    else:  # python 3: BeautifulSoup 4
        from bs4 import BeautifulSoup, SoupStrainer
        soup = BeautifulSoup(content, 'html.parser',
                             parse_only=SoupStrainer('a'))
        linklist = soup.find_all('a', attrs={'href': re.compile(regex)})
    return [link.get('href') for link in linklist]
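
# Illustrative example (made-up markup):
#   getlinks_html('<a href="a.nc">a</a> <a href="b.txt">b</a>', regex=r'\.nc$')
# returns ['a.nc'], since only <a> tags whose href matches the regex are kept.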


def getlinks_json(content, regex=''):
    import json
    parsed_json = json.loads(content)['rows']
    linklist = [str(row[0]) for row in parsed_json]
    if regex != '':
        regex = re.compile(regex)
        linklist = [link for link in linklist if regex.search(link)]
    return linklist
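
# Illustrative example (the 'rows' layout is assumed from the code above and
# from the ?format=json listings used in __main__):
#   getlinks_json('{"rows": [["file_a.nc", 1], ["readme.txt", 2]]}', r'\.nc$')
# returns ['file_a.nc']: the first field of each row, filtered by the regex.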


# requests.Response utils:

def print_response(response):
    if response:
        for key, value in response.headers.items():
            print('{}\t= {}'.format(key, value))
        print(response.status_code, response.reason)


def is_html(response):
    return response and response.ok and ('html' in response.headers['Content-Type'])


def is_json(response):
    return response and response.ok and ('json' in response.headers['Content-Type'])


def url_mtime(response):
    """
    Returns timestamp of remote file as seconds since the epoch.
    """
    try:
        mtime = response.headers['Last-Modified']
        urltime = time.strptime(mtime, "%a, %d %b %Y %H:%M:%S %Z")
        return time.mktime(urltime)
    except Exception as e:
        print('Exception: {:}'.format(e))
        return sys.maxsize


class SessionUtils:

    def __init__(self, timeout=5, max_tries=5, verbose=False, clobber=False):
        self.timeout = timeout
        self.max_tries = max_tries
        self.verbose = verbose
        self.clobber = clobber
        self.session = requests.Session()

    def open_url(self, url, ntries=None, get=False):
        """
        Return requests.Response object for specified url.
        Retries up to self.max_tries times if server is busy.
        By default, retrieves header only.
        """
        if not ntries:
            ntries = self.max_tries
        response = None

        try:
            if get:
                response = self.session.get(url, timeout=self.timeout)
            else:
                response = self.session.head(url, timeout=self.timeout)
            # if self.verbose:
            #     print('{}\t{}\t{}'.format(
            #         response.status_code, url, response.headers['Content-Type']))

            # redirect as needed
            # TODO: get new url back to caller
            loc = response.headers.get('Location')
            if loc:  # response.is_redirect:
                if self.verbose:
                    print('redirected to {}'.format(loc))
                response = self.open_url(loc, get=get)

            # return response if okay
            if response.ok:
                pass

            # retry if server is busy
            elif (response.status_code > 499) and (ntries > 0):
                if self.verbose:
                    print('Server busy; will retry {}'.format(url))
                response = retry(self.open_url, url, ntries=ntries, get=get)

            # give up if too many tries
            elif ntries == 0:
                print('FAILED after {} tries: {}'.format(self.max_tries, url))

            # give up if bad response
            else:
                print('Bad response for {}'.format(url))
                print_response(response)

        except requests.exceptions.Timeout:
            if ntries > 0:
                if self.verbose:
                    print('Server timeout; will retry {}'.format(url))
                response = retry(self.open_url, url, ntries=ntries, get=get)

        except Exception as e:
            print('Exception: {:}'.format(e))

        finally:
            return response
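
    # Illustrative use (hypothetical URL):
    #   s = SessionUtils(verbose=True)
    #   response = s.open_url('https://example.gov/files/', get=True)
    #   if response and response.ok:
    #       print(response.headers.get('Content-Type'))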

    def needs_download(self, url, filepath, check_times=False, response=None):
        """
        Returns False if filepath is present and size matches remote url;
        True otherwise. Optionally check timestamp as well.
        """

        # only download files
        if is_page(url):
            return False

        if not os.path.isfile(filepath):
            # if self.verbose:
            #     print('Local file not found:', filepath)
            return True

        if not response:
            response = self.open_url(url)
        if not (response and response.ok):
            return False

        # check file size
        diffsize = os.path.getsize(filepath) != int(response.headers['Content-Length'])
        if not check_times:
            return diffsize

        # optionally check timestamp
        else:
            older = os.path.getmtime(filepath) < url_mtime(response)
            return diffsize or older

    def download_file(self, url, filepath):
        """
        Download url to filepath, then set the local file's
        modification time to match the remote timestamp.
        """
        try:
            r = self.session.get(url, timeout=self.timeout, stream=True)
            with open(filepath, 'wb') as fd:
                for chunk in r.iter_content(chunk_size=512):
                    fd.write(chunk)
            response = self.open_url(url)
            set_mtime(filepath, url_mtime(response))
        except Exception as e:
            print('Exception: {:}'.format(e))

    def list_pageurls(self, url, regex=''):
        """
        Returns a sorted, unique set of links from a given url.
        Optionally specify regex to filter for acceptable files;
        default is to list only links starting with url.
        """
        response = self.open_url(url, get=True)
        if is_html(response):
            linklist = getlinks_html(response.text, regex)
        elif is_json(response):
            linklist = getlinks_json(response.text, regex)
        else:
            return []

        # get full url
        linklist = [full_url(url, link) for link in linklist]

        # if no filter, return only links containing url
        # TODO: skip original url, and urls ending in "/"
        if regex == '':
            linklist = [link for link in linklist if base_url(url) in link]

        # return sorted, unique list
        return sorted(set(linklist))
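
    # Illustrative use (hypothetical URL; the regex argument is optional):
    #   s.list_pageurls('https://example.gov/files/?format=json', regex=r'\.nc$')
    # returns a sorted, de-duplicated list of absolute links to *.nc files.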

    def download_allfiles(self, url, dirpath, regex='', check_times=False,
                          response=None, clobber=False, dry_run=False):
        """
        Downloads all available files from a remote url into a local dirpath.
        Default is to download only if local file doesn't match remote size;
        set clobber=True to always download.
        """
        if not response:
            response = self.open_url(url)
        if not (response and response.ok):
            return []

        downloaded = []
        if dry_run and self.verbose:
            print('Dry run:')
        if not os.path.exists(dirpath) and not dry_run:
            os.makedirs(dirpath)

        for link in self.list_pageurls(url, regex=regex):
            f = os.path.basename(link)
            filepath = os.path.join(dirpath, f)
            if clobber or self.needs_download(
                    link, filepath, check_times=check_times):
                if not dry_run:
                    self.download_file(link, filepath)
                downloaded.append(filepath)
                if self.verbose:
                    print('+ ' + f)

        return downloaded
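
    # Illustrative use (hypothetical URL and local directory):
    #   s = SessionUtils(verbose=True)
    #   new_files = s.download_allfiles('https://example.gov/files/?format=json',
    #                                   '/tmp/files', regex=r'\.nc$', dry_run=True)
    # With dry_run=True nothing is written; the would-be downloads are returned.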

    def spider(self, url, level=0, visited=None):
        """
        Demo crawler
        """
        if visited is None:
            visited = []
        try:
            response = self.open_url(url)
            if response.ok:
                if self.verbose:
                    print('{}\t{}\t{}'.
                          format(level, url, response.headers['Content-Type']))
                else:
                    print(url)
                visited.append(url)

                if is_page(url):
                    for link in self.list_pageurls(url):
                        if (base_url(url) in link) and (link not in visited):
                            visited = self.spider(link, level=level + 1,
                                                  visited=visited)
            else:
                print('spider {} {}:\t{}'.
                      format(response.status_code, response.reason, url))

        except Exception as e:
            print('Exception: {:}'.format(e))

        finally:
            return visited

# end of class SessionUtils


if __name__ == '__main__':
    # parameters
    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        url = 'https://oceandata.sci.gsfc.nasa.gov/Ancillary/LUTs/?format=json'

    # logging
    debug = False  # True
    if debug:
        import logging

        logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)

    # init session, run crawler
    s = SessionUtils(verbose=True)
    s.spider(url)
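
# Run directly, the script crawls the given URL (or the LUTs listing above):
#   python SessionUtils.py 'https://oceandata.sci.gsfc.nasa.gov/Ancillary/LUTs/?format=json'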