2011-03-27 142 views
0


爲了練習PHP和HTML表單,我決定製作一個小型web應用程序:它從其他網站收集數據,並以適合移動設備的方式顯示。我需要有人幫我分析一個HTTP表單。

在本次練習中,我選擇了我所在地區的巴士公司網站:http://delijn.be/en/index.htm。我分析了該網站並找到了名爲「form1」的表單,該表單通過POST方法將數據發送到網站http://reisinfo.delijn.be/reisinfo/RouteplannerHomeBeperktServlet?taal=en

我開始編寫php代碼,並在互聯網上找到你可以用cURL發送POST字段。所以我做了。不幸的是它不工作。我得到該網站的錯誤頁面。所以我想一些領域必須缺失,但我已經檢查了一切,我找不到另一個領域。這樣我就會再次來到這裏,尋求幫助。

該Web應用程序託管在my home server上,也可以是downloaded

我將非常感激,如果有人可以幫助我解決這個問題,
ief2


PS:代碼的某些部分會被寫入荷蘭語,所以這裏有一些翻譯:

  • Gemeente =城市/城鎮
  • Plaats =位置
  • Nummer =數字
  • Datum =日期
  • Dag =日
  • Maand =月
  • Jaar =年
  • Uur =小時
  • Aankomst =到達
  • Vertrek =出發
  • Berekenen =計算


PPS:下載鏈接顯然無法使用(雖然我自己下載時沒有問題),所以這裏有一些代碼片段:

index.php

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"> 
<html> 
    <head> 
     <title>De Lijn Mobile</title> 
     <meta name="viewport" content="width = device-width"> 
    </head> 

    <body> 
     <!-- Route query form. It is posted to calculateRoute.php, which reads every field below from $_POST by its name attribute. -->
     <form name="main" action="calculateRoute.php" method="post"> 
      <!-- Departure address (Vertrek): town, street, house number -->
      <b>Vertrek:</b><br> 
      Gemeente: <input type="text" name="vertrekGemeente"><br> 
      Straat: <input type="text" name="vertrekStraat"><br> 
      Nummer: <input type="text" name="vertrekNummer"><br> 
      <hr> 
      <!-- Arrival address (Aankomst): town, street, house number -->
      <b>Aankomst:</b><br> 
      Gemeente: <input type="text" name="aankomstGemeente"><br> 
       Straat: <input type="text" name="aankomstStraat"><br> 
      Nummer: <input type="text" name="aankomstNummer"><br> 
      <hr> 
      <b>Datum:</b><br> 
      <?php 
       // Date is a project class from Date.php. The $now instance supplies the
       // default values of the day/month/year/hour/minutes inputs below.
       // NOTE(review): presumably a no-argument Date is "now" - confirm in Date.php.
       require("./Date.php"); 
       $now = new Date(); 
      ?> 
      <!-- datumType says whether the date/time below is an arrival or a departure time -->
      <input type="radio" name="datumType" value="aankomst" checked> Aankomst<br> 
      <input type="radio" name="datumType" value="vertrek"> Vertrek<br> 
      Dag: <input type="text" size="2" name="datumDag" value="<?php echo $now->day; ?>"><br> 
      Maand: <input type="text" size="2" name="datumMaand" value="<?php echo $now->month; ?>"><br> 
      Jaar: <input type="text" size="4" name="datumJaar" value="<?php echo $now->year; ?>"><br> 
      Tijdstip: <input type="text" size="2" name="datumUur" value="<?php echo $now->hour; ?>"> : 
      <input type="text" size="2" name="datumMinuten" value="<?php echo $now->minutes; ?>"><br> 
      <hr> 
      <input type="submit" value="Bereken"><br> 
     </form> 
    </body> 
</html> 

calculateRoute.php

<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html>
    <head>
     <title>De Lijn Mobile - Berekeningen</title>
    </head>

    <body>
     <?php
      // Handles the form posted by index.php: collects the departure address,
      // the arrival address and the requested date/time from $_POST, then hands
      // them to searchDeLijn() (DeLijn.php), which echoes the remote response.
      require_once("./Plaats.php");
      require_once("./Date.php");
      require_once("./DeLijn.php");

      echo "Gathering data...<br>";

      // Read every expected field once. A field that was not submitted becomes
      // an empty string instead of raising an "undefined index" notice.
      $fieldNames = array(
       "vertrekGemeente", "vertrekStraat", "vertrekNummer",
       "aankomstGemeente", "aankomstStraat", "aankomstNummer",
       "datumType", "datumDag", "datumMaand", "datumJaar",
       "datumUur", "datumMinuten");
      $input = array();
      foreach($fieldNames as $fieldName) {
       $input[$fieldName] = isset($_POST[$fieldName]) ? $_POST[$fieldName] : "";
      }

      // Departure place
      $vertrekPlaats = new Plaats($input["vertrekGemeente"],
       $input["vertrekStraat"],
       $input["vertrekNummer"]);

      // Arrival place
      $aankomstPlaats = new Plaats($input["aankomstGemeente"],
       $input["aankomstStraat"],
       $input["aankomstNummer"]);

      // Requested date/time; datumType is "aankomst" (arrival) or "vertrek" (departure)
      $datumType = $input["datumType"];
      $maand = $input["datumMaand"];
      $datum = Date::withDate($input["datumJaar"], $maand, $input["datumDag"],
       $input["datumUur"], $input["datumMinuten"]);
      // NOTE(review): the month is assigned again after construction, exactly as
      // in the original; presumably a workaround for Date::withDate - confirm in Date.php.
      $datum->month = $maand;

      echo "Searching...<br>";
      searchDeLijn($vertrekPlaats,
       $aankomstPlaats,
       $datumType,
       $datum);

     ?>
    </body>
</html>

DeLijn.php

<?php 

require_once("Route.php"); 
require_once("Date.php"); 
require_once("Plaats.php"); 

// ==== Accepted values of the $dateType argument of searchDeLijn()
define('DATE_ARRIVAL', "aankomst");
define('DATE_DEPARTURE', "vertrek");

// ==== Posts a route query to the De Lijn route planner.
//
// $dep, $ar  : objects exposing ->gemeente, ->straat and ->nummer
//              (departure and arrival place)
// $dateType  : DATE_DEPARTURE or DATE_ARRIVAL; any other value is treated
//              as an arrival time
// $date      : object exposing ->day, ->month, ->year, ->hour and ->minutes
//
// Side effects: prints the posted fields and echoes the raw response body.
// Returns whatever extractRoutesFromXMLData() returns for the response
// (an array of result-table row nodes or null), or null when the HTTP
// request fails or returns an empty body.
function searchDeLijn($dep, $ar, $dateType, $date) {
    $vertrekkenOfAankomen = (DATE_DEPARTURE === $dateType) ? "vertrekken" : "aankomen";

    // The minutes value is rounded down to a multiple of five before posting.
    $myMins = (int)$date->minutes;
    $myMins -= ($myMins % 5);

    $postFields = array(
     "form1:vertrekGemeenteInput" => $dep->gemeente,
     "form1:vertrekStraatInput" => $dep->straat,
     "form1:vertrekNrInput" => $dep->nummer,

     "form1:aankomstGemeenteInput" => $ar->gemeente,
     "form1:aankomstStraatInput" => $ar->straat,
     "form1:aankomstNrInput" => $ar->nummer,

     "form1:vertrekkenOfAankomenRadio" => $vertrekkenOfAankomen,
     "form1:dagCombo" => (string)(int)$date->day,
     "form1:maandCombo" => (string)(int)$date->month,
     "form1:jaarCombo" => $date->year,
     "form1:uurCombo" => (string)(int)$date->hour,
     "form1:minutenCombo" => (string)$myMins);

    print_r($postFields);

    // do the curl
    $ch = curl_init();
    if($ch === false) {
     return null;
    }
    curl_setopt($ch, CURLOPT_URL,
     'http://reisinfo.delijn.be/reisinfo/RouteplannerHomeBeperktServlet?taal=nl');
    curl_setopt($ch, CURLOPT_POST, 1);
    // Passing an array to CURLOPT_POSTFIELDS makes cURL send multipart/form-data.
    // Encoding the fields into a string sends them as
    // application/x-www-form-urlencoded, like a plain HTML form post.
    // NOTE(review): the site may additionally require a session cookie and
    // further form fields - verify against the live form.
    curl_setopt($ch, CURLOPT_POSTFIELDS, http_build_query($postFields, '', '&'));
    curl_setopt($ch, CURLOPT_HEADER, 0);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);

    $contents = curl_exec($ch);
    curl_close($ch);
    // curl_exec() returns false on failure; an empty body has nothing to parse.
    if($contents === false || $contents === '') {
     return null;
    }

    echo $contents;

    $myRouteObjects = extractRoutesFromXMLData($contents);
    return $myRouteObjects;
}

// ==== Extracts the rows of the route-planner results table from a response.
//
// $dataString : the response markup as a string
//
// Returns an array holding each row node of the results table body, in
// document order, or null when the table body or its rows cannot be obtained.
function extractRoutesFromXMLData($dataString) {
    // Bail out when a lookup FAILED (null); the original guards were inverted
    // and returned null precisely when the lookup had succeeded.
    $tableBody = getResultsTableBody($dataString);
    if($tableBody === null) { return null; }

    $tableRows = getTableRowsOfTableBody($tableBody);
    if($tableRows === null) { return null; }

    // put them in an array
    $myArray = array();
    $count = $tableRows->length;
    for($i = 0; $i < $count; $i++) {
     // item($i), not item(0): collect every row instead of the first row $count times
     $myArray[] = $tableRows->item($i);
    }

    return $myArray;
}

// ==== Locates the body of the results table inside a response.
//
// $dataString : the response markup as a string
//
// Finds the table whose id is "routeplanner_overzicht", takes its first
// <TBODY>...</TBODY> section (case-insensitive) and parses that fragment as XML.
// Returns a DOMDocument whose root element is the table body, or null when
// the table or body is not found or the fragment is not well-formed XML.
function getResultsTableBody($dataString) {
    // Get table element (the original matched against an undefined variable)
    $status = preg_match('/<TABLE id="routeplanner_overzicht".*?>.*?<\/TABLE>/is',
      $dataString, $matches);
    if($status !== 1) {
     // 0 = no match, false = regex error
     return null;
    }

    $tableElement = $matches[0];

    // Extract body
    $status = preg_match('/<TBODY>.*?<\/TBODY>/is',
      $tableElement, $matches);
    if($status !== 1) {
     return null;
    }

    // Collect libxml parse errors internally instead of emitting PHP warnings,
    // and restore the previous setting afterwards.
    $previousSetting = libxml_use_internal_errors(true);
    $doc = new DOMDocument();
    $loaded = $doc->loadXML($matches[0]);
    libxml_clear_errors();
    libxml_use_internal_errors($previousSetting);

    if($loaded === false) {
     return null;
    }

    return $doc;
}


// ==== Returns the row elements of the first table body in a document.
//
// $xmlDoc : a DOMDocument containing a table body element
//
// Returns a DOMNodeList with the <tr> children of the first <tbody> element
// (possibly empty), or null when $xmlDoc is not a DOMDocument or the XPath
// evaluation fails. Element names are compared case-insensitively because
// the source markup uses upper-case tags and XML names are case-sensitive.
function getTableRowsOfTableBody($xmlDoc) {
    if(!($xmlDoc instanceof DOMDocument)) {
     return null;
    }

    $xpath = new DOMXPath($xmlDoc);
    // XPath positions are 1-based, so "[1]" selects the first table body
    // ("[0]" never matches anything).
    $xpathres = $xpath->evaluate(
     "(//*[translate(local-name(), 'TBODY', 'tbody') = 'tbody'])[1]"
     . "/*[translate(local-name(), 'TR', 'tr') = 'tr']");
    if($xpathres === false) {
     return null;
    }

    return $xpathres;
}
?> 

Date.php、Plaats.php 和 Route.php 分別包含封裝日期、位置和可能路線的類。

+0

`document.forms[1].elements.length` 顯示有14個元素,但你只列出了11個。另外,有些網站會拒絕沒有cookie的請求(這裏有一個跟蹤cookie和兩個javascript檢測)。請用Firebug檢查網絡流量。 – mario 2011-03-27 14:58:09

+0

我的確漏掉了其中一個(有一個單選按鈕),不過你看的是'Route.php',那是結果解析器,它處理的是一個11列的表格。無論如何,我檢查了cookie,我認爲你是對的。當我禁用cookie時,該網站會顯示「Uw sessie is verlopen」(=「您的會話已過期」)。有可能手工構造這樣的cookie嗎?或者有其他辦法解決這個問題嗎? – v1Axvw 2011-03-27 15:37:43

+0

沒有看你的代碼。下載鏈接非實用。最好在這裏粘貼摘錄。 - cURL允許以某種方式設置cookie,參見各種'CURLOPT_COOKIE *'標誌。我認爲這是這類任務最常見的問題。 – mario 2011-03-27 15:42:06

回答

0

確實缺少了一些字段,而且服務器對POST數據的響應非常奇怪。我只能自動化其中一個頁面;要點擊其他鏈接,僅有cookie顯然是不夠的。

我寫了一些代碼,對其他需要了解某個表單結構的人可能有用:

HTMLFormExtractor.py

#!/usr/bin/python 
import sys 
import getopt 
import urllib 
import re 

# ############################ 
# This code may be used by anyone. It may be used in both free 
# and commercial software. It may be copied, modified and even 
# be sold. The creator of this code takes no responsibility for 
# any damage this script could do. 
# ############################ 

# ############################ 
# ############################ 
# Usage: ./exec [-x] [URL] 
# 
# This application logs all forms of an HTML document and their
# objects which have the HTML 'name' attribute set. The program
# currently only works when the attributes of the objects are
# written in XML style, i.e. with quoted values (eg: name="myname").
# 
# Options: 
# -x: Create an XML document of the following form: 
#   ==== BEGIN XML ==== 
#   formlist 
#    form (variable) 
#     attribute (variable) 
#      name 
#      value 
# 
#     object (variable) 
#      type (eg: input) 
#      name (eg: username) 
#   ==== END XML ==== 
# 
# URL: a URL pointing to an available HTML file. If it is not
#  specified, the program will read the HTML document
#  from the standard input.
# 
# ############################ 

# ===== DATA =====
# Names of the HTML elements that getFormObjects() looks for inside a form.
FORM_OBJECTS_TAG_NAME = (
    "input", "textarea", "label",
    "fieldset", "legend", "select",
    "optgroup", "option", "button",
)



# ===== CLASSES ===== 
class HTMLAttribute:
    """One name/value attribute of an HTML tag."""

    def __init__(self, name, value, orString = None):
        """Store the attribute.

        name     -- attribute name as written in the markup
        value    -- attribute value without its surrounding quotes
        orString -- raw text the attribute was parsed from, if known
        """
        self.name = name
        self.value = value
        # Keep the raw text; the original always stored None here and
        # silently dropped the argument.
        self.originalString = orString

    @classmethod
    def withAttributeString(cls, string):
        """Build an attribute from text of the form attrName="value".

        The value may be enclosed in double or single quotes; the closing
        quote must be the same character as the opening one, so a value
        such as "it's" is kept intact. Names may contain '-', ':' and '.'.

        Raises ValueError when no quoted attribute is found in the string.
        """
        match = re.search(r"""([\w:.-]+)\s*=\s*(?:"([^"]*)"|'([^']*)')""",
                          string, re.S)
        if match is None:
            raise ValueError("not an attribute string: %r" % (string,))

        attrName = match.group(1)
        # Exactly one of the two value groups took part in the match.
        value = match.group(2)
        if value is None:
            value = match.group(3)

        return cls(attrName, value, string)

class HTMLObject:
    """An HTML tag: its tag name plus the attributes parsed from it."""

    def __init__(self, aName):
        """Create an object for the tag called aName with no attributes."""
        self.name = aName
        self.attributes = [] # contains HTMLAttribute

    def addAttribute(self, anAttribute):
        """Append anAttribute to this object's attribute list."""
        self.attributes.append(anAttribute)

    def getAttributeWithName(self, aName):
        """Return the first attribute whose name equals aName, ignoring
        case, or None when there is no such attribute."""
        aName = aName.lower()
        for anAttribute in self.attributes:
            if anAttribute.name.lower() == aName:
                return anAttribute
        return None

    @classmethod
    def withTagString(cls, string):
        """Build an object from the first opening tag found in string.

        string has the form <aTagName attrName="value" ... >; anything
        after the tag's closing '>' is ignored. Self-closing tags such as
        <br/> are accepted. Only attributes with a quoted value are
        recorded; their names may contain '-', ':' and '.', so
        data-name="x" is stored as 'data-name' and not mistaken for a
        'name' attribute.

        Raises ValueError when string contains no opening tag.
        """
        tagMatch = re.search(r"<([\w:-]+)([^>]*)>", string)
        if tagMatch is None:
            raise ValueError("no opening tag found in: %r" % (string,))

        myObj = cls(tagMatch.group(1))

        attrRegex = re.compile(
            r"""([\w:.-]+)\s*=\s*(?:"([^"]*)"|'([^']*)')""", re.S)
        for attrMatch in attrRegex.finditer(tagMatch.group(2)):
            # Exactly one of the two value groups took part in the match.
            value = attrMatch.group(2)
            if value is None:
                value = attrMatch.group(3)
            myObj.addAttribute(
                HTMLAttribute(attrMatch.group(1), value, attrMatch.group(0)))

        return myObj

class HTMLForm:
    """Pairs a form's name with the list of objects found inside it."""

    def __init__(self, name, htmlObjects):
        # htmlObjects is kept as passed in (a list of HTMLObject); no copy is made.
        self.HTMLObjects = htmlObjects
        self.name = name

# ===== FUNCTIONS ===== 
def getFormsFromHTML(htmlData):
    """Return the markup of every <form>...</form> element in htmlData.

    Matching is case-insensitive and spans line breaks. Each list item is
    the complete text from the opening <form ...> tag up to and including
    the first closing </form> tag that follows it. The word boundary after
    "form" keeps tags that merely start with those letters (for example
    <formlist>) from being treated as forms.
    """
    formRegex = re.compile(r"<form\b.*?>.*?</form\s*>", re.IGNORECASE | re.S)
    return formRegex.findall(htmlData)

def getFormObjects(aForm):
    """Return a list of HTMLObject, one per form-control tag in aForm.

    aForm is the markup of a form as a string. Every opening tag whose
    name is listed in FORM_OBJECTS_TAG_NAME is collected, case-insensitively
    and in document order. Returns an empty list when
    FORM_OBJECTS_TAG_NAME is empty.
    """
    if len(FORM_OBJECTS_TAG_NAME) == 0:
        return []

    # Build <(?:input|textarea|...)\b.*?> ; the word boundary stops a listed
    # name from matching as the prefix of a longer tag name.
    alternatives = "|".join(re.escape(aTagName)
                            for aTagName in FORM_OBJECTS_TAG_NAME)
    regObj = re.compile(r"<(?:" + alternatives + r")\b.*?>", re.S | re.I)

    return [HTMLObject.withTagString(aTag) for aTag in regObj.findall(aForm)]

def printForms(foundForms, foundObjects):
    """Print a human-readable summary of every form to standard output.

    foundForms   -- list of HTMLObject, one per form tag
    foundObjects -- list of lists of HTMLObject; the list at index i holds
                    the objects contained in foundForms[i]

    For each form this prints its attributes, the number of objects found
    in that form, and every object that has a 'name' attribute.
    """
    for index, aForm in enumerate(foundForms):
        formObjects = foundObjects[index]

        # Single-argument print(...) behaves the same under Python 2 and 3.
        print("===== FORM " + str(index + 1) + " =====")

        print("\tATTRIBUTES:")
        for anAttribute in aForm.attributes:
            print("\t\t" + anAttribute.name + ": '" + anAttribute.value + "'")

        # Count this form's own objects; the original printed the number
        # of forms here.
        print("\n\t" + str(len(formObjects)) + " OBJECTS:")
        for anObject in formObjects:
            nameAttribute = anObject.getAttributeWithName("name")
            if nameAttribute is not None:
                print("\t\t" + anObject.name + " (name=\"" + nameAttribute.value + "\")")

        print("\n")


def createXMLString(foundForms, foundObjects):
    """Return an XML description of the forms as a string.

    foundForms   -- list of HTMLObject, one per form tag
    foundObjects -- list of lists of HTMLObject; the list at index i holds
                    the objects contained in foundForms[i]

    Only objects that have a 'name' attribute are written. Attribute names,
    attribute values and object names are XML-escaped so that characters
    such as '&' and '<' cannot break the document.

    XML:
     formlist
      form (mult)
       attribute (mult)
        name
        value

       object (mult)
        type (eg: input)
        name (eg: username)
    """
    from xml.sax.saxutils import escape

    xmlString = "<formlist>\n"
    for index, aForm in enumerate(foundForms):
        # make form child
        formXMLChild = "\t<form>\n"

        # add all attributes
        for anAttr in aForm.attributes:
            formXMLChild += "\t\t<attribute>\n"
            formXMLChild += "\t\t\t<name>" + escape(anAttr.name) + "</name>\n"
            formXMLChild += "\t\t\t<value>" + escape(anAttr.value) + "</value>\n"
            formXMLChild += "\t\t</attribute>\n"

        # add all input objects if they have a name
        for anObject in foundObjects[index]:
            nameAttr = anObject.getAttributeWithName("name")
            if nameAttr is not None:
                formXMLChild += "\t\t<object>\n"
                formXMLChild += "\t\t\t<type>" + escape(anObject.name) + "</type>\n"
                formXMLChild += "\t\t\t<name>" + escape(nameAttr.value) + "</name>\n"
                formXMLChild += "\t\t</object>\n"

        # end child and append; the original wrote a second opening <form>
        # here, which produced malformed XML
        formXMLChild += "\t</form>\n\n"
        xmlString += formXMLChild

    # drop the last newline, then end the xml and return the string
    xmlString = xmlString[0:len(xmlString)-1] + "</formlist>\n"
    return xmlString


# ===== MAIN =====
# NOTE: this script targets Python 2 (it relies on urllib.urlopen).

# Parse the command line options: an optional -x flag and an optional URL
userArgv = sys.argv[1:]
flags, arguments = getopt.getopt(userArgv, "x")
wantsXMLFormat = ('-x', '') in flags
hasURL = len(arguments) > 0

# Get the HTML data, from the URL when one was given, else from standard input
myHTML = None
if hasURL:
    myURL = arguments[0]
    # urllib.urlopen reports a failed connection by raising IOError rather
    # than by returning None, so the failure is handled here.
    try:
        urlHandle = urllib.urlopen(myURL)
    except IOError:
        print("Failed to open the URL")
        sys.exit(1)
    # Close the handle even when reading fails.
    try:
        myHTML = urlHandle.read()
    finally:
        urlHandle.close()

else:
    myHTML = sys.stdin.read()

# Get all forms
htmlForms = getFormsFromHTML(myHTML)

# Loop with all forms; foundObjects[i] holds the objects of foundForms[i]
foundForms = []
foundObjects = [] # list of list
for aFormTag in htmlForms:
    # append the form tag itself
    foundForms.append(HTMLObject.withTagString(aFormTag))

    # append the objects contained in the form (computed once per form)
    foundObjects.append(getFormObjects(aFormTag))


# Print or create xml
if not wantsXMLFormat:
    printForms(foundForms, foundObjects)
else:
    myXMLString = createXMLString(foundForms, foundObjects)
    print(myXMLString)