
I have been using a Python script to search for and download SMAP satellite data from the NSIDC HTTPS site. My code worked until the start of last week, when it began failing with the error below. How can I fix the connection to the NSIDC/NASA HTTPS site?

urllib2.HTTPError: HTTP Error 404: Not Found

Any help?

The code was adapted from an NSIDC example to do what I need. Here it is:

"""This script, NSIDC_parse_HTML_BatchDL.py, defines an HTML parser to scrape data files from an earthdata HTTPS URL and bulk downloads all files to your working directory. 

This code was adapted from https://wiki.earthdata.nasa.gov/display/EL/How+To+Access+Data+With+Python 
Last edited Jan 26, 2017 G. Deemer""" 

import urllib2 
import os 
from cookielib import CookieJar 
from HTMLParser import HTMLParser 

# Define a custom HTML parser to scrape the contents of the HTML data table 
class MyHTMLParser(HTMLParser):
    def __init__(self):
        HTMLParser.__init__(self)
        self.inLink = False
        self.dataList = []
        self.directory = '/'
        self.indexcol = ';'
        self.Counter = 0

    def handle_starttag(self, tag, attrs):
        self.inLink = False
        if tag == 'table':
            self.Counter += 1
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    if self.directory in value or self.indexcol in value:
                        break
                    else:
                        self.inLink = True
                        self.lasttag = tag

    def handle_endtag(self, tag):
        if tag == 'table':
            self.Counter += 1

    def handle_data(self, data):
        if self.Counter == 1:
            if self.lasttag == 'a' and self.inLink and data.strip():
                self.dataList.append(data)

parser = MyHTMLParser() 
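# A quick illustration of what the parser collects (the HTML snippet below is
# hypothetical, just mimicking an Earthdata directory listing):
# parser.feed('<table><a href="SMAP_L4_file.h5">SMAP_L4_file.h5</a></table>')
# parser.dataList would then hold ['SMAP_L4_file.h5']; links whose href
# contains '/' or ';' (directories and index columns) are skipped.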

# Define function for batch downloading 
def BatchJob(Files, cookie_jar):
    for dat in Files:
        print "downloading: ", dat
        JobRequest = urllib2.Request(url + dat)
        JobRequest.add_header('cookie', cookie_jar) # Pass the saved cookie into the additional HTTP request
        JobRedirect_url = urllib2.urlopen(JobRequest).geturl() + '&app_type=401'

        # Request the resource at the modified redirect url
        Request = urllib2.Request(JobRedirect_url)
        Response = urllib2.urlopen(Request)
        f = open(dat, 'wb')
        f.write(Response.read())
        f.close()
        Response.close()
    print "Files downloaded to: ", os.path.dirname(os.path.realpath(__file__))
#=========================================================================== 
# The following code block is used for HTTPS authentication 
#=========================================================================== 

# The user credentials that will be used to authenticate access to the data 
username = "user" 
password = "password" 

# The FULL url of the directory which contains the files you would like to bulk download 

url = "https://n5eil01u.ecs.nsidc.org/SMAP/SPL4SMGP.003/2017.10.14/" # Example URL 
# Create a password manager to deal with the 401 response that is returned
# from Earthdata Login

password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm() 
password_manager.add_password(None, "https://urs.earthdata.nasa.gov", username, password) 

# Create a cookie jar for storing cookies. This is used to store and return 
# the session cookie given to us by the data server (otherwise it will just 
# keep sending us back to Earthdata Login to authenticate). Ideally, we 
# should use a file based cookie jar to preserve cookies between runs. This 
# will make it much more efficient. 

cookie_jar = CookieJar() 
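# As the note above says, a file-based jar would preserve cookies between
# runs. A minimal sketch using cookielib.LWPCookieJar (the filename is an
# assumption, not part of the original script):
# from cookielib import LWPCookieJar
# cookie_jar = LWPCookieJar('earthdata_cookies.txt')
# try:
#     cookie_jar.load() # reuse cookies saved by a previous run, if any
# except IOError:
#     pass # no cookie file yet; create one later with cookie_jar.save()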

# Install all the handlers. 
opener = urllib2.build_opener(
    urllib2.HTTPBasicAuthHandler(password_manager), 
    #urllib2.HTTPHandler(debuglevel=1), # Uncomment these two lines to see 
    #urllib2.HTTPSHandler(debuglevel=1), # details of the requests/responses 
    urllib2.HTTPCookieProcessor(cookie_jar)) 
urllib2.install_opener(opener) 

# Create and submit the requests. There are a wide range of exceptions that 
# can be thrown here, including HTTPError and URLError. These should be 
# caught and handled. 
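# A minimal sketch of the handling suggested above (the messages and the bare
# re-raise are my own, not part of the original script):
# try:
#     DirResponse = urllib2.urlopen(url)
# except urllib2.HTTPError as e:
#     print "The server returned an HTTP error:", e.code # e.g. 404 Not Found
#     raise
# except urllib2.URLError as e:
#     print "Failed to reach the server:", e.reason
#     raise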

#=========================================================================== 
# Open a request to grab filenames within a directory. Printing them is optional 
#=========================================================================== 

DirRequest = urllib2.Request(url) 
DirResponse = urllib2.urlopen(DirRequest) 

# Get the redirect url and append 'app_type=401' 
# to do basic http auth 
DirRedirect_url = DirResponse.geturl() 
DirRedirect_url += '&app_type=401' 
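# Note: if the server does not redirect (DirResponse.geturl() == url), the
# appended '&app_type=401' produces a malformed URL and a 404 -- my best guess
# at the failure reported above, consistent with the comment that
# DirRedirect_url is what 404s. A defensive variant (my assumption, not from
# the NSIDC example) would be:
# if DirRedirect_url != url:
#     separator = '&' if '?' in DirRedirect_url else '?'
#     DirRedirect_url += separator + 'app_type=401'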

# Request the resource at the modified redirect url 
DirRequest = urllib2.Request(DirRedirect_url) 
DirResponse = urllib2.urlopen(DirRequest) 

DirBody = DirResponse.read() 

# Use the HTML parser defined above to print the contents of the directory containing the data 
parser.feed(DirBody) 
Files = parser.dataList 

# Display the contents of the python list declared in the HTMLParser class 
# print Files #Uncomment to print a list of the files 

#========================================================================= 
# Call the function to download all files in url 
#========================================================================= 

BatchJob(Files, cookie_jar) # Comment out to prevent downloading to your working directory 

Why not use `url` for your request? `DirRedirect_url` produces the 404 response. – t.m.adam


@t.m.adam, because the code uses the URL library to find the files inside each folder on the site. That is why it has this structure: inspect the site, get the available files, and download the data. –

Answer


I was able to fix the error by requesting the page directly and picking out the files to download, as in the code below.

"""This script, NSIDC_parse_HTML_BatchDL.py, defines an HTML parser to scrape data files from an earthdata HTTPS URL and bulk downloads all files to your working directory. 

This code was adapted from https://wiki.earthdata.nasa.gov/display/EL/How+To+Access+Data+With+Python Last edited Jan 26, 2017 G. Deemer""" 

import urllib2 
import os 
from cookielib import CookieJar 


# Define function for batch downloading 
def BatchJob(Files, cookie_jar):
    for dat in Files:
        print "downloading: ", dat
        JobRequest = urllib2.Request(url + dat)
        JobRequest.add_header('cookie', cookie_jar) # Pass the saved cookie into the additional HTTP request
        JobRedirect_url = urllib2.urlopen(JobRequest).geturl() + '&app_type=401'

        # Request the resource at the modified redirect url
        Request = urllib2.Request(JobRedirect_url)
        Response = urllib2.urlopen(Request)
        f = open(dat, 'wb')
        f.write(Response.read())
        f.close()
        Response.close()
    print "Files downloaded to: ", os.path.dirname(os.path.realpath(__file__))
#========================================================================== 
# The following code block is used for HTTPS authentication 
#========================================================================== 

# The user credentials that will be used to authenticate access to the data 
username = "user" 
password = "password" 

# The FULL url of the directory which contains the files you would like to bulk download 

url = "https://n5eil01u.ecs.nsidc.org/SMAP/SPL4SMGP.003/2017.10.14/" # Example URL 
# Create a password manager to deal with the 401 response that is returned
# from Earthdata Login

password_manager = urllib2.HTTPPasswordMgrWithDefaultRealm() 
password_manager.add_password(None, 
           "https://urs.earthdata.nasa.gov", 
           username, password) 

# Create a cookie jar for storing cookies. This is used to store and return 
# the session cookie given to us by the data server (otherwise it will just 
# keep sending us back to Earthdata Login to authenticate). Ideally, we 
# should use a file based cookie jar to preserve cookies between runs. This 
# will make it much more efficient. 

cookie_jar = CookieJar() 

# Install all the handlers. 
opener = urllib2.build_opener(
    urllib2.HTTPBasicAuthHandler(password_manager), 
    #urllib2.HTTPHandler(debuglevel=1), # Uncomment these two lines to see 
    #urllib2.HTTPSHandler(debuglevel=1), # details of the requests/responses 
    urllib2.HTTPCookieProcessor(cookie_jar)) 
urllib2.install_opener(opener) 

# Create and submit the requests. There are a wide range of exceptions that 
# can be thrown here, including HTTPError and URLError. These should be 
# caught and handled. 

#=========================================================================== 
# Open a request to grab filenames within a directory. Printing them is optional 
#=========================================================================== 

DirResponse = urllib2.urlopen(url) 
htmlPage = DirResponse.read() 

# Pull the .h5 filenames straight out of the directory listing's anchor tags
listFiles = [x.split(">")[0].replace('"', "")
             for x in htmlPage.split("><a href=")
             if x.split(">")[0].endswith('.h5"')]
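# Illustration of the string surgery above (the directory HTML layout is an
# assumption based on the first script's parser): splitting the page on
# '><a href=' leaves chunks that begin with '"SOME_FILE.h5">...', so
# x.split(">")[0] is the quoted href and replace('"', "") strips the quotes.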

# Display the contents of the python list of files 
# print listFiles #Uncomment to print a list of the files 

#========================================================================= 
# Call the function to download all files in url 
#========================================================================= 

BatchJob(listFiles, cookie_jar) # Comment out to prevent downloading to your working directory