Linux下socket实现网页抓取
分类:
C/C++学习点滴
DO spiders DO
linux编程
2007-10-06 21:33
951人阅读
评论(0)
收藏
举报
主要用来和WinSock进行下比较:
--WinSock--
需要初始化:
/* WinSock has to be initialised with WSAStartup before it is used;
 * a non-zero return value is the error code, reported below. */
if( (Ret = WSAStartup(MAKEWORD(1,1), &wsaData) ) != 0 )
{
    printf("WSAStartup failed with error %d\n", Ret);
    return FALSE;
}
头文件:
--WinSock--
#include <winsock2.h> //header
#pragma comment (lib, "ws2_32.lib") //lib
--Linux--
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
各个头文件的作用还需要进一步研究
gethostbyname(host)://从主机名返回地址
这个都是一样的,返回一个struct hostent *的指针。
地址结构:
--WinSock--
SOCKADDR_IN
--Linux--
sockaddr_in
实际上两者是一样的,都是:
struct sockaddr_in{
shortsin_family;
unsigned short sin_port;
struct in_addr sin_addr;
charsin_zero[8];
};
(
这个结构是sockaddr的等价结构
/* Generic socket address: a family tag followed by an opaque,
 * family-specific address payload. */
struct sockaddr
{
    /* address family, AF_XXX */
    unsigned short sa_family;
    /* 14 bytes of protocol address */
    char sa_data[14];
};
)
其中IP地址结构struct in_addr在WinSock中的定义如下(Linux下该结构只有一个成员s_addr,所以下面的示例程序直接访问sin_addr.s_addr):
/*
 * IP address storage. A single union lets the same storage be accessed
 * as four unsigned chars, as two unsigned shorts, or as one unsigned long.
 */
struct in_addr {
    union {
        /* byte-wise view */
        struct {
            unsigned char s_b1;
            unsigned char s_b2;
            unsigned char s_b3;
            unsigned char s_b4;
        } S_un_b;
        /* 16-bit word view */
        struct {
            unsigned short s_w1;
            unsigned short s_w2;
        } S_un_w;
        /* whole address as one integer */
        unsigned long S_addr;
    } S_un;
};
Socket:
--WinSock--
返回句柄SOCKET,就是socket描述符
--Linux--
比较直接返回int型socket描述符
函数接口都一样
函数例子:
socket (AF_INET, SOCK_STREAM, 0); //TCP
connect(sock, (const sockaddr * )&tcpaddr, sizeof(tcpaddr)); //返回值有不同
--WinSock--
If no error occurs, connect returns zero. Otherwise, it returns SOCKET_ERROR, and a specific error code can be retrieved by calling
WSAGetLastError.
--Linux--
错误返回-1
send(sock_description, message, strlen(message), 0); //返回值不同
--WinSock--
If no error occurs, send returns the total number of bytes sent, which can be less than the number indicated by len. Otherwise, a value of
SOCKET_ERROR is returned, and a specific error code can be retrieved by calling WSAGetLastError.
--Linux--
错误返回-1
recv(sock_description, buffer, sizeof(buffer), 0);//返回值不同
--WinSock--
If no error occurs, recv returns the number of bytes received. If the connection has been gracefully closed, the return value is zero. Otherwise, a
value of SOCKET_ERROR is returned, and a specific error code can be retrieved by calling WSAGetLastError.
--Linux--
错误返回-1(错误码保存在errno中);对端正常关闭连接时返回0
结束:
--WinSock--
closesocket(sock);
/* WSACleanup is the counterpart of WSAStartup; when it fails, the
 * error code is obtained from WSAGetLastError(). */
if( WSACleanup() == SOCKET_ERROR )
{
    printf("WSACleanup failed with error %d \n", WSAGetLastError() );
}
--Linux--
close(sock);
下面是一个在Linux下用socket实现HTTP协议GET方法的示例程序:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>
/* Name of the web server; main() resolves it with gethostbyname(). */
char *host = "www.hao123.com";
/* TCP port to connect to; main() converts it with htons() for sin_port. */
int port = 80;
/*
 * Fetch "/" from host:port with an HTTP GET request and copy the reply to
 * stdout: first the header section (read byte by byte up to the empty
 * line that terminates it), then everything that follows until the server
 * closes the connection.
 *
 * The request that is sent is echoed to stdout before connecting.
 *
 * Returns 0 on success. On a resolver or socket failure a message is
 * written to stderr and EXIT_FAILURE is returned.
 *
 * NOTE: the body is printed exactly as received; if the server answers
 * with chunked transfer encoding the chunk framing is not decoded.
 */
int main(void)
{
    char buffer[512];
    char message[512];
    struct sockaddr_in pin;
    struct hostent *remoteHost;
    int isock;
    int msg_len;
    size_t sent;
    int line_chars;
    int rc = EXIT_FAILURE;

    remoteHost = gethostbyname(host);
    if (remoteHost == NULL
        || remoteHost->h_addrtype != AF_INET
        || remoteHost->h_length != (int)sizeof pin.sin_addr
        || remoteHost->h_addr_list[0] == NULL) {
        fprintf(stderr, "Error resolving host\n");
        return EXIT_FAILURE;
    }

    memset(&pin, 0, sizeof pin);
    pin.sin_family = AF_INET;
    pin.sin_port = htons((unsigned short)port);
    /* Copy the raw address bytes instead of dereferencing a cast pointer. */
    memcpy(&pin.sin_addr, remoteHost->h_addr_list[0], sizeof pin.sin_addr);

    /* "Connection: close" asks the server to end the connection after the
     * reply, so the read-until-EOF loop below terminates promptly. The
     * request ends with exactly one empty line. */
    msg_len = snprintf(message, sizeof message,
                       "GET / HTTP/1.1\r\n"
                       "Host: %s\r\n"
                       "Accept: */*\r\n"
                       "User-Agent: Mozilla/4.0(compatible)\r\n"
                       "Connection: close\r\n"
                       "\r\n",
                       host);
    if (msg_len < 0 || (size_t)msg_len >= sizeof message) {
        fprintf(stderr, "Error building request\n");
        return EXIT_FAILURE;
    }
    printf("%s", message);

    isock = socket(AF_INET, SOCK_STREAM, 0);
    if (isock == -1) {
        fprintf(stderr, "Error opening socket!\n");
        return EXIT_FAILURE;
    }

    if (connect(isock, (struct sockaddr *)&pin, sizeof pin) == -1) {
        fprintf(stderr, "Error connecting to socket\n");
        goto out;
    }

    /* send() may accept fewer bytes than requested; loop until all are out. */
    sent = 0;
    while (sent < (size_t)msg_len) {
        ssize_t n = send(isock, message + sent, (size_t)msg_len - sent, 0);
        if (n < 0) {
            fprintf(stderr, "Error in send\n");
            goto out;
        }
        sent += (size_t)n;
    }

    /* Header section: echo one byte at a time and stop after the first
     * empty line. '\r' is ignored when measuring the line length. */
    line_chars = 0;
    for (;;) {
        char ch;
        ssize_t got = recv(isock, &ch, 1, 0);

        if (got < 0) {
            fprintf(stderr, "Error in recv\n");
            goto out;
        }
        if (got == 0) {
            break;              /* peer closed before the headers ended */
        }
        putchar(ch);
        if (ch == '\r') {
            continue;
        }
        if (ch == '\n') {
            if (line_chars == 0) {
                break;          /* empty line: end of headers */
            }
            line_chars = 0;
        } else {
            line_chars++;
        }
    }

    /* Body: copy until the peer closes. fwrite keeps embedded NUL bytes. */
    for (;;) {
        ssize_t got = recv(isock, buffer, sizeof buffer, 0);

        if (got < 0) {
            fprintf(stderr, "Error in recv\n");
            goto out;
        }
        if (got == 0) {
            break;
        }
        fwrite(buffer, 1, (size_t)got, stdout);
    }
    rc = EXIT_SUCCESS;

out:
    close(isock);
    return rc;
}