2013-01-01 29 views
2

當我連接到某些網站時,收到的響應如下(WinSock 分塊數據編碼):

Content-Type: text/html; charset=ISO-8859-1 

Connection: close 

Transfer-Encoding: chunked 

Date: Tue, 01 Jan 2013 18:49:53 GMT 


fff8 

,並在文件的結尾,它看起來像:

</script><!-- vBadvanced 1-3-9-4-8-0 --> 

</body> 
</html 

1 

> 

0 

但是當我連接 stackoverflow.com 時,打印出來的結果完全正常……源代碼中可能會多出一些空白行,但這沒有問題。爲什麼其他網站會在內容中加入這些數字?

我該如何解決?另外,我怎樣才能把 HTTP 標頭(header)與 HTML 本身分離開來?

我的代碼如下:

#define _WIN32_WINNT 0x501 

#include <iostream> 
#include <winsock2.h> 
#include <ws2tcpip.h> 
#include <stdio.h> 
#include <fstream> 
#include <vector> 

using namespace std; 

// Fetches "/" from WebPage over plain HTTP (TCP port 80) and collects the raw
// response (status line, headers and body) in a local string.
//
// WebPage: host name to contact, e.g. "google.com"; it is also sent as the
//          Host header of the request.
//
// Nothing is returned. On any failure (Winsock start-up, name resolution,
// socket creation, connect, send) the function releases what it acquired and
// returns early. The body is not decoded here, so a response sent with
// "Transfer-Encoding: chunked" still contains the hexadecimal chunk-size lines.
void Get(string WebPage)
{
    WSADATA wsaData;
    if (WSAStartup(MAKEWORD(2,2), &wsaData) != 0) return;

    // The request line needs single spaces: METHOD SP target SP version.
    string Header = "GET / HTTP/1.1\r\n";
    Header += "Host: " + WebPage + "\r\n";
    Header += "Connection: close\r\n";
    Header += "\r\n";

    // Ask the resolver for an IPv4/TCP address with port 80 already filled in,
    // so the returned sockaddr can be handed to connect() as-is.
    struct addrinfo hints = {};
    hints.ai_family = AF_INET;
    hints.ai_socktype = SOCK_STREAM;
    hints.ai_protocol = IPPROTO_TCP;

    struct addrinfo *result = NULL;
    if (getaddrinfo(WebPage.c_str(), "80", &hints, &result) != 0 || result == NULL)
    {
        WSACleanup();
        return;
    }

    SOCKET Socket = socket(result->ai_family, result->ai_socktype, result->ai_protocol);
    if (Socket == INVALID_SOCKET)
    {
        freeaddrinfo(result);
        WSACleanup();
        return;
    }

    bool Ok = connect(Socket, result->ai_addr, static_cast<int>(result->ai_addrlen)) != SOCKET_ERROR;
    freeaddrinfo(result);

    // send() may accept only part of the buffer; keep going until all is out.
    const char *Pending = Header.c_str();
    int Remaining = static_cast<int>(Header.size());
    while (Ok && Remaining > 0)
    {
        const int Sent = send(Socket, Pending, Remaining, 0);
        if (Sent == SOCKET_ERROR)
        {
            Ok = false;
        }
        else
        {
            Pending += Sent;
            Remaining -= Sent;
        }
    }

    std::string Response;

    if (Ok)
    {
        // Nothing more will be sent; signal that to the peer.
        shutdown(Socket, SD_SEND);

        char Buffer[4096];
        for (;;)
        {
            const int bytes = recv(Socket, Buffer, static_cast<int>(sizeof(Buffer)), 0);
            // 0 means the peer closed the connection, SOCKET_ERROR means failure;
            // in both cases there is nothing valid in Buffer to append.
            if (bytes <= 0) break;
            Response.append(Buffer, static_cast<size_t>(bytes));
        }
    }

    closesocket(Socket);
    WSACleanup();
}

// Program entry point: requests the front page of one fixed host.
int main()
{
    const char *const Host = "google.com";
    Get(Host);
    return 0;
}
+1

無關,但你不檢查'recv'調用的錯誤。 –

+0

如果您使用C或C++在Windows上執行HTTP,則應該查看WinInet(http://msdn.microsoft.com/en-us/library/windows/desktop/aa385331.aspx)或WinHTTP(http://msdn.microsoft.com/en-us/library/windows/desktop/aa384273.aspx) –

回答

3

見本wiki頁面:http://en.wikipedia.org/wiki/Chunked_transfer_encoding

這些十六進制數就是塊長度:每個塊長度之後緊跟着該長度所指定大小的實際數據(有效載荷),然後又是下一個塊長度。如果塊長度爲零,則表示後面不再有數據字節(eof)。這些元素之間由換行符(CRLF)分隔。 我不確定您貼出的內容是否完全符合分塊格式,看起來您還需要處理多個連續的換行符。只需在瀏覽器中查看該頁面及其源代碼即可。

編輯:

剛剛發現這個嗅探工具,它顯示了所有我想在你的情況知道的細節:

http://web-sniffer.net/

+0

我試圖通過CRLF分割響應字符串,仍然不知道如何找到哪個塊是哪個或如何刪除頭數據從文件的頂部..雖然沒關係。我會繼續嘗試。 – Brandon

+0

每個頭部屬性都以CRLF結束,頭部本身由另一個CRLF終止(可以被視爲一個空的頭部屬性)。之後,您應該處於第一個塊長度(fff8)的位置。看起來,複製數據時你有一個額外的換行符(也許是一個轉換問題,其中CR和LF有單獨的換行符)。 – Sam

+0

請注意,分塊編碼允許服務器省略「Content-Length」屬性。您應該對最後的零塊長度或另一個Transfer-Encoding的標題中指定的大小執行檢查。 – Sam

-1

此功能將「unchunk」你的HTTP數據 - 在VB6中,但你會得到這個想法(真的很老的代碼)

' Decodes an HTTP response whose body uses "Transfer-Encoding: chunked".
'
' Indata  - the complete raw response: header block, blank line, then the body.
' Returns - the header block (including its terminating blank line) followed by
'           the concatenated chunk payloads. Indata is returned unchanged when
'           the headers do not announce chunked encoding, when no header
'           terminator is found, or when the chunk framing is malformed or
'           truncated.
Private Function UnChunk(Indata As String) As String
    Dim headerEnd As Long
    Dim headerText As String
    Dim pos As Long
    Dim lineEnd As Long
    Dim sizeText As String
    Dim semi As Long
    Dim chunksize As Long
    Dim result As String

    'can't let this crash: any runtime error falls back to returning the input
    On Error GoTo returnInData

    'the header block ends at the first blank line (CRLF CRLF)
    headerEnd = InStr(Indata, vbCrLf & vbCrLf)
    If headerEnd = 0 Then GoTo returnInData

    'look for the encoding in the headers only, never in the body;
    'both markers must be present, so a missing one means "not chunked"
    headerText = LCase(Left(Indata, headerEnd + 1))
    If InStr(headerText, "transfer-encoding:") = 0 Or InStr(headerText, "chunked") = 0 Then GoTo returnInData

    'keep the headers together with the complete CRLF CRLF terminator
    result = Left(Indata, headerEnd + 3)
    'first chunk-size line starts right after the terminator
    pos = headerEnd + 4

    Do
        'a chunk starts with a hexadecimal size line ended by CRLF
        lineEnd = InStr(pos, Indata, vbCrLf)
        If lineEnd = 0 Then Exit Do

        sizeText = Trim(Mid(Indata, pos, lineEnd - pos))
        'ignore optional chunk extensions (";name=value")
        semi = InStr(sizeText, ";")
        If semi > 0 Then sizeText = Trim(Left(sizeText, semi - 1))

        'anything that is not pure hex means the framing is broken
        If Len(sizeText) = 0 Then Exit Do
        If sizeText Like "*[!0-9A-Fa-f]*" Then Exit Do

        'the trailing & forces a Long, so a value such as FFF8 is not
        'read as a negative 16-bit Integer
        chunksize = CLng("&H" & sizeText & "&")
        If chunksize < 0 Then Exit Do

        If chunksize = 0 Then
            'by spec, a zero-length chunk ends the body
            UnChunk = result
            Exit Function
        End If

        'the payload must be followed by CRLF, otherwise the data is
        'truncated or the size was wrong
        If Mid(Indata, lineEnd + 2 + chunksize, 2) <> vbCrLf Then Exit Do

        result = result & Mid(Indata, lineEnd + 2, chunksize)
        'jump past payload and its CRLF; never scan inside the payload
        pos = lineEnd + 2 + chunksize + 2
    Loop

    'decoding did not reach the terminating chunk, so return the input data
returnInData:
    UnChunk = Indata
End Function