Hi. I would like to build a web crawler, but I am only familiar with C and C++. Can anyone give me tips on how to develop one using those two languages? I will be using a Borland 4.5 compiler.

Recommended Answers

All 2 Replies

You need a library that supports HTTP. I've never used it, but libcurl sounds like it will do the job.

You can use MFC together with the WinInet API.
Example:

#include <WinInet.h>
#pragma comment(lib, "wininet")

class CHTTP
{
public:
	CHTTP(LPCTSTR lpszServerName, LPCTSTR lpszTarget, int Method, LPCTSTR lpszFileName);
	CHTTP();
	~CHTTP();

public:
	bool OpenInet(LPCTSTR lpszAgent);
	// creating connection lpszServerName hostname e.g. "google.com"
	bool Connection(LPCTSTR lpszServerName);

	void CloseOpenedInet();
	void CloseConection();
	bool CheckError(bool bTest);

	DWORD m_dwLastError;
	HINTERNET m_hInternet;
	HINTERNET m_hConnection;
	HINTERNET m_hRequest;

	void CloseRequest();

	// sending request
	bool SendRequest(LPCTSTR lpszVerb, LPCTSTR lpszTarget);

	// GET query
	bool Get(LPCTSTR lpszTarget);

	// POST query
	bool Post(LPCTSTR lpszTarget);

	// write answer into file
	bool WriteRespToFile(LPCTSTR FileName);
	CFile *m_lpFile;
};

And here is the implementation:

#include "StdAfx.h"
#include "HTTPWork.h"

// One-shot constructor: open a session, connect to lpszServerName, request
// lpszTarget and store the response in lpszFileName.
//
// lpszServerName  host name to connect to
// lpszTarget      object to request on that host
// Method          1 selects POST, any other value selects GET
// lpszFileName    file that receives the response
//
// Each step runs only if the previous one succeeded, so m_dwLastError keeps the
// code of the step that actually failed instead of being overwritten by the
// follow-up calls. A constructor cannot return a status; callers inspect
// m_dwLastError / the handles afterwards.
CHTTP::CHTTP(LPCTSTR lpszServerName, LPCTSTR lpszTarget, int Method, LPCTSTR lpszFileName)
: m_dwLastError(0)
, m_hInternet(NULL)
, m_hConnection(NULL)
, m_hRequest(NULL)
, m_lpFile(NULL)
{
	if (!OpenInet(TEXT("")) || !Connection(lpszServerName))
	{
		return;
	}

	const bool bRequestSent = (Method == 1) ? Post(lpszTarget) : Get(lpszTarget);
	if (bRequestSent)
	{
		WriteRespToFile(lpszFileName);
	}
}

// Default constructor: no error recorded, no handle open, no file attached.
// Nothing is opened here; the caller drives OpenInet / Connection / Get / Post.
CHTTP::CHTTP()
	: m_dwLastError(0),
	  m_hInternet(NULL),
	  m_hConnection(NULL),
	  m_hRequest(NULL),
	  m_lpFile(NULL)
{
}

// Releases every handle still held, in the reverse order of their creation:
// the request first, then the connection it was opened on, then the session.
CHTTP::~CHTTP()
{
	this->CloseRequest();
	this->CloseConection();
	this->CloseOpenedInet();
}

// Records the thread's last error code when an operation failed.
// bTest: result of the operation being checked (true means success).
// Returns bTest unchanged, so the call can be used directly in a return.
bool CHTTP::CheckError(bool bTest)
{
	if (bTest)
	{
		return true;
	}

	m_dwLastError = ::GetLastError();
	return false;
}

// Opens the WinInet session unless one is already open.
// lpszAgent: user-agent string passed to InternetOpen.
// Returns true when a session handle is available afterwards; on failure the
// error code is stored in m_dwLastError.
bool CHTTP::OpenInet(LPCTSTR lpszAgent)
{
	// An existing session is reused as-is.
	if (m_hInternet != NULL)
	{
		return true;
	}

	m_hInternet = ::InternetOpen(lpszAgent, INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
	const bool bOpened = (m_hInternet != NULL);
	return CheckError(bOpened);
}

// Connects to lpszServerName on the default HTTP port, replacing any
// connection that was open before.
// Returns true when the connection handle was created; on failure the error
// code is stored in m_dwLastError.
bool CHTTP::Connection(LPCTSTR lpszServerName)
{
	// Always start from a closed connection; this leaves m_hConnection NULL.
	CloseConection();

	m_hConnection = ::InternetConnect(m_hInternet, lpszServerName, INTERNET_DEFAULT_HTTP_PORT, NULL, NULL, INTERNET_SERVICE_HTTP, 0, 1);
	const bool bConnected = (m_hConnection != NULL);
	return CheckError(bConnected);
}

// Closes the session handle, if any, and marks it as closed.
void CHTTP::CloseOpenedInet()
{
	if (m_hInternet == NULL)
	{
		return;
	}

	::InternetCloseHandle(m_hInternet);
	m_hInternet = NULL;
}

// Closes the connection handle, if any, and marks it as closed.
void CHTTP::CloseConection()
{
	if (m_hConnection == NULL)
	{
		return;
	}

	::InternetCloseHandle(m_hConnection);
	m_hConnection = NULL;
}

// Closes the request handle, if any, and marks it as closed.
void CHTTP::CloseRequest()
{
	if (m_hRequest == NULL)
	{
		return;
	}

	::InternetCloseHandle(m_hRequest);
	m_hRequest = NULL;
}

// Opens a new request on the current connection and sends it.
//
// lpszVerb    HTTP verb, e.g. "GET" or "POST"
// lpszTarget  object to request on the connected host
//
// Returns true when the request was opened and sent. On failure returns false
// with the cause in m_dwLastError. If sending fails, the request, connection
// and session are all closed, so the caller has to call OpenInet / Connection
// again before retrying.
bool CHTTP::SendRequest(LPCTSTR lpszVerb, LPCTSTR lpszTarget)
{
	// Drop any previous request up front so a stale handle can never be
	// re-sent (or reported as success) when there is no connection.
	CloseRequest();

	if (m_hConnection == NULL)
	{
		m_dwLastError = ERROR_INVALID_HANDLE;
		return false;
	}

	m_hRequest = ::HttpOpenRequest(m_hConnection, lpszVerb, lpszTarget, NULL, NULL, NULL, INTERNET_FLAG_KEEP_CONNECTION, 1);
	if (m_hRequest == NULL)
	{
		return CheckError(false);
	}

	if (!::HttpSendRequest(m_hRequest, NULL, 0, NULL, 0))
	{
		// Capture the error before closing the handles: the close calls may
		// overwrite the thread's last-error value.
		m_dwLastError = ::GetLastError();
		CloseRequest();
		CloseConection();
		CloseOpenedInet();
		return false;
	}

	return true;
}

// Issues a GET request for lpszTarget on the current connection.
// Returns the result of SendRequest.
bool CHTTP::Get(LPCTSTR lpszTarget)
{
	LPCTSTR const lpszVerb = TEXT("GET");
	return SendRequest(lpszVerb, lpszTarget);
}

// Issues a POST request for lpszTarget on the current connection.
// Returns the result of SendRequest.
bool CHTTP::Post(LPCTSTR lpszTarget)
{
	LPCTSTR const lpszVerb = TEXT("POST");
	return SendRequest(lpszVerb, lpszTarget);
}

// Streams the response of the current request into the file FileName.
//
// FileName  path of the file to create (an existing file is truncated)
//
// Returns true when the whole response was written. Returns false when no
// request is open (m_dwLastError is left as set by the step that failed
// earlier), when reading the response fails, or when the file cannot be
// opened or written; in the latter cases the error code is stored in
// m_dwLastError. The file is only created once the first read has succeeded.
bool CHTTP::WriteRespToFile(LPCTSTR FileName)
{
	if (m_hRequest == NULL)
	{
		return false;
	}

	char Data[2048];
	DWORD dwBytesRead = 0;

	if (!::InternetReadFile(m_hRequest, Data, sizeof(Data), &dwBytesRead))
	{
		return CheckError(false);
	}

	// A stack object closes the file on every exit path; m_lpFile is not used
	// to own it.
	CFile file;
	CFileException openError;
	if (!file.Open(FileName, CFile::modeCreate | CFile::modeWrite, &openError))
	{
		m_dwLastError = static_cast<DWORD>(openError.m_lOsError);
		return false;
	}

	bool bComplete = true;
	try
	{
		// A successful read of zero bytes marks the end of the response.
		while (dwBytesRead != 0)
		{
			file.Write(Data, dwBytesRead);
			if (!::InternetReadFile(m_hRequest, Data, sizeof(Data), &dwBytesRead))
			{
				// Stop on a failed read instead of looping on stale data.
				bComplete = CheckError(false);
				break;
			}
		}
		file.Close();
	}
	catch (CFileException* pError)
	{
		// CFile::Write / CFile::Close report failures by throwing.
		m_dwLastError = static_cast<DWORD>(pError->m_lOsError);
		pError->Delete();
		file.Abort();
		bComplete = false;
	}

	return bComplete;
}

Alternatively, you can use sockets directly.

Be a part of the DaniWeb community

We're a friendly, industry-focused community of developers, IT pros, digital marketers, and technology enthusiasts meeting, networking, learning, and sharing knowledge.