python2 = sys.version_info.major < 3
if python2:
    from urlparse import urljoin, urlsplit, urlunsplit
else:
    from urllib.parse import urljoin, urlsplit, urlunsplit
def base_url(url):
    # Keep scheme, host, and path; drop query and fragment.
    parts = urlsplit(url)
    return urlunsplit((parts.scheme, parts.netloc, parts.path, None, None))
def full_url(url, link):
    """
    Add query to urljoin() results
    """
    base = urljoin(url, link)
    scheme, netloc, path, query, fragment = urlsplit(base)
    query = urlsplit(url).query
    return urlunsplit((scheme, netloc, path, query, None))
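
# --- Illustrative example (not part of the original script) ---
# A minimal sketch of how the helpers above resolve a relative link against a
# page URL; the values are hypothetical, and full_url() is assumed to behave
# as reconstructed above (re-attaching the page's query string).
def _demo_full_url():
    page = 'https://example.com/files/?format=json'
    # urljoin() resolves the relative link and drops the page's query...
    assert urljoin(page, 'data.txt') == 'https://example.com/files/data.txt'
    # ...while full_url() carries the query along:
    return full_url(page, 'data.txt')  # 'https://example.com/files/data.txt?format=json'
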
"""
Make the dangerous assumption that URLs
pointing to another web page always end in '/'.
"""
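
# --- Illustrative example (not part of the original script) ---
# Under that assumption, the trailing slash is what distinguishes a
# directory-style page from a downloadable file (hypothetical URLs):
#   'https://example.com/files/'      -> another page, worth following
#   'https://example.com/files/a.hdf' -> a file, worth downloading
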
def retry(func, *args, **kwargs):
    """
    Retry specified function call after a short delay
    """
    from time import sleep
    ntries = kwargs.get('ntries')
    delay = int(5 + (30. * (1. / (float(ntries) + 1.))))
    if kwargs.get('verbose'):
        print('Sleeping {}s; {} tries left.'.format(delay, ntries - 1))
    sleep(delay)
    kwargs['ntries'] = ntries - 1
    return func(*args, **kwargs)
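
# --- Illustrative example (not part of the original script) ---
# A sketch of how retry() might be used; the function and URL are
# hypothetical. retry() passes the keyword arguments through, so the
# wrapped callable must accept 'ntries' and 'verbose' itself. Note that
# retry() sleeps for several seconds before re-invoking the callable.
def _demo_retry():
    def flaky_fetch(url, ntries=0, verbose=False):
        print('fetching {} ({} tries left)'.format(url, ntries))
        return None
    # Sleeps, decrements ntries, then calls flaky_fetch with the new count.
    return retry(flaky_fetch, 'https://example.com/file', ntries=2, verbose=True)
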
"""
Get function and arguments for caller
"""
caller = inspect.stack()[1]  # frame record of the calling function
func = eval(caller[3])  # resolve the caller's function object from its name
args = inspect.getargvalues(caller[0])  # the caller's argument names and local values
values = [args.locals[arg] for arg in args.args]
arglist = dict(zip(args.args, values))  # map each argument name to its current value
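
# --- Illustrative example (not part of the original script) ---
# Sketch of what inspect.getargvalues() exposes for a frame: the argument
# names plus the current local values, which the fragment above zips into a
# dict. The function name here is hypothetical.
def _demo_getargvalues(a, b=2):
    import inspect
    frame = inspect.currentframe()
    args = inspect.getargvalues(frame)
    return dict(zip(args.args, [args.locals[arg] for arg in args.args]))
# e.g. _demo_getargvalues(1) returns {'a': 1, 'b': 2}
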
"""
Set modification time for specified file.
Set access time to "now".
"""
atime = time.time()
if python2:
    os.utime(filepath, (atime, mtime))
else:
    os.utime(filepath, times=(atime, mtime))
# BeautifulSoup 3 API (Python 2); bs4 renames these to parse_only= and find_all().
from BeautifulSoup import BeautifulSoup, SoupStrainer
soup = BeautifulSoup(content, parseOnlyThese=SoupStrainer('a'))
linklist = soup.findAll('a', attrs={'href': re.compile(regex)})
linklist = [link.get('href') for link in linklist]
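
# --- Illustrative example (not part of the original script) ---
# The fragment above uses the old BeautifulSoup 3 API; a rough Python 3
# equivalent with bs4 might look like this, assuming the 'beautifulsoup4'
# package is installed.
def _demo_getlinks_html_bs4(content, regex=''):
    import re
    from bs4 import BeautifulSoup, SoupStrainer
    soup = BeautifulSoup(content, 'html.parser',
                         parse_only=SoupStrainer('a'))
    links = soup.find_all('a', href=re.compile(regex))
    return [link.get('href') for link in links]
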
parsed_json = json.loads(content)['rows']
linklist = [str(row[0]) for row in parsed_json]

regex = re.compile(regex)
linklist = [link for link in linklist if regex.search(link)]
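
# --- Illustrative example (not part of the original script) ---
# Sketch of the JSON layout this parsing expects: a top-level 'rows' list
# whose first column holds the link. The sample payload is made up.
def _demo_getlinks_json():
    import json
    sample = '{"rows": [["ancillary1.hdf", 12345], ["ancillary2.hdf", 67890]]}'
    rows = json.loads(sample)['rows']
    links = [str(row[0]) for row in rows]
    assert links == ['ancillary1.hdf', 'ancillary2.hdf']
    return links
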
for key, value in response.headers.items():
    print('{}\t= {}'.format(key, value))
print(response.status_code, response.reason)
return response and response.ok and ('html' in response.headers['Content-Type'])
return response and response.ok and ('json' in response.headers['Content-Type'])
"""
Returns timestamp of remote file as seconds since the epoch.
"""
try:
    mtime = response.headers['Last-Modified']
    urltime = time.strptime(mtime, "%a, %d %b %Y %H:%M:%S %Z")
    return time.mktime(urltime)
except Exception as e:
    print('Exception: {:}'.format(e))
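
# --- Illustrative example (not part of the original script) ---
# Sketch of parsing an HTTP Last-Modified header the same way as above and
# stamping a local file with it, as in the file-timestamp fragment earlier.
# The header value is made up. Note that time.mktime() interprets the
# struct_time as local time, so the result can be offset from true UTC.
def _demo_stamp_file(filepath):
    import os
    import time
    header = 'Wed, 21 Oct 2015 07:28:00 GMT'
    urltime = time.strptime(header, "%a, %d %b %Y %H:%M:%S %Z")
    mtime = time.mktime(urltime)
    os.utime(filepath, (time.time(), mtime))  # access time "now", mtime from header
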
def __init__(self, timeout=5, max_tries=5, verbose=False, clobber=False):
"""
Return requests.Session object for specified url.
Retries up to self.max_tries times if server is busy.
By default, retrieves header only.
"""
loc = response.headers.get('Location')

print('redirected to {}'.format(loc))

elif (response.status_code > 499) and (ntries > 0):

    print('Server busy; will retry {}'.format(url))

print('FAILED after {} tries: {}'.format(ntries, url))

print('Bad response for {}'.format(url))

except requests.exceptions.Timeout:

    print('Server timeout; will retry {}'.format(url))

except Exception as e:
    print('Exception: {:}'.format(e))
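
# --- Illustrative example (not part of the original script) ---
# Minimal sketch of the kind of status handling seen in the fragments above,
# using the 'requests' library directly; the original's retry and redirect
# wiring is not reproduced here.
def _demo_open_once(url, timeout=5):
    import requests
    try:
        response = requests.head(url, timeout=timeout)  # header only, no redirects
        if response.status_code in (301, 302, 303, 307, 308):
            print('redirected to {}'.format(response.headers.get('Location')))
        elif response.status_code > 499:
            print('Server busy; will retry {}'.format(url))
        return response
    except requests.exceptions.Timeout:
        print('Server timeout; will retry {}'.format(url))
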
"""
Returns False if filepath is present and size matches remote url;
True otherwise. Optionally check timestamp as well.
"""
if not os.path.isfile(filepath):
    return True

if not (response and response.ok):

diffsize = os.path.getsize(filepath) != int(response.headers['Content-Length'])

older = os.path.getmtime(filepath) < url_mtime(response)
return diffsize or older
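
# --- Illustrative example (not part of the original script) ---
# Minimal sketch of the same size comparison against a remote file, assuming
# the header came from a HEAD request made with 'requests'; the behavior when
# the remote copy cannot be verified is this sketch's choice, not the original's.
def _demo_size_check(filepath, url):
    import os
    import requests
    response = requests.head(url, timeout=5)
    if not (response and response.ok):
        return True  # cannot verify the remote copy; assume a download is needed
    remote_size = int(response.headers['Content-Length'])
    return os.path.getsize(filepath) != remote_size
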
try:
    with open(filepath, 'wb') as fd:
        for chunk in r.iter_content(chunk_size=512):
            fd.write(chunk)
except Exception as e:
    print('Exception: {:}'.format(e))
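
# --- Illustrative example (not part of the original script) ---
# Sketch of the streaming-download pattern used above with 'requests':
# fetch with stream=True so iter_content() yields the file in small chunks.
def _demo_stream_download(url, filepath):
    import requests
    r = requests.get(url, stream=True, timeout=5)
    if r.ok:
        with open(filepath, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=512):
                fd.write(chunk)
    return r.status_code
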
"""
Returns a sorted, unique set of links from a given url.
Optionally specify regex to filter for acceptable files;
default is to list only links starting with url.
"""
response = self.open_url(url, get=True)

linklist = [full_url(url, link) for link in linklist]

linklist = [link for link in linklist if base_url(url) in link]

return sorted(set(linklist))
                   response=None, clobber=False, dry_run=False):
    """
    Downloads all available files from a remote url into a local dirpath.
    Default is to download only if local file doesn't match remote size;
    set clobber=True to always download.
    """
    if not (response and response.ok):

    if not os.path.exists(dirpath) and not dry_run:

    f = os.path.basename(link)
    filepath = os.path.join(dirpath, f)

    link, filepath, check_times=check_times):

    downloaded.append(filepath)
def spider(self, url, level=0, visited=None):

    format(level, url, response.headers['Content-Type']))

    if (base_url(url) in link) and (link not in visited):
        visited = self.spider(link, level=level + 1,

    print('spider {} {}:\t{}'.format(response.status_code, response.reason, url))

    except Exception as e:
        print('Exception: {:}'.format(e))
if __name__ == '__main__':

    if len(sys.argv) > 1:
        url = sys.argv[1]
    else:
        url = 'https://oceandata.sci.gsfc.nasa.gov/Ancillary/LUTs/?format=json'

    logging.basicConfig(format='%(levelname)s:%(message)s', level=logging.DEBUG)