我自己写的爬虫程序
// A single forum post ("tiezi") as collected by the crawler.
ref class TieZi
{
public:
// Floor number of the post (its position within the thread).
int nID;
// Name of the user who wrote the post.
String^ UserName;
// Time at which the post was made.
// NOTE(review): no code visible in this file section assigns this field — confirm it is populated elsewhere.
DateTime^ dataTime;
// Text content of the post.
// NOTE(review): no code visible in this file section assigns this field — confirm it is populated elsewhere.
String^ contentText;
};
// Groups every collected post that was written by one user.
ref class SendTieZi
{
public:
// Name of the user who wrote the posts.
String^ UserName;
// All posts collected for this user.
List<TieZi^>^ arrTieZi;
};
// Files one post under its author in the per-user list of posts.
//
// Parameters:
//   arrAllUserTieZi - list holding one SendTieZi entry per distinct user name; it is
//                     searched linearly and extended when the author is not present yet.
//   tiezi           - the post to file; ignored when it is nullptr or when its UserName
//                     is null or empty.
//
// When the author has no entry yet, a new entry is created AND the post is stored in
// it, so that the author's first post is counted like every later one.
void AddTieZi(List<SendTieZi^>^ arrAllUserTieZi,TieZi^ tiezi)
{
    // A post without a usable author name cannot be attributed to anybody.
    if (tiezi == nullptr || String::IsNullOrEmpty(tiezi->UserName))
    {
        return;
    }

    // Existing author: append to that author's list.
    for each(SendTieZi^ sendTiezi in arrAllUserTieZi)
    {
        if (sendTiezi->UserName->Equals(tiezi->UserName))
        {
            sendTiezi->arrTieZi->Add(tiezi);
            return;
        }
    }

    // First post seen from this author: create the entry and record the post in it.
    SendTieZi^ newUser = gcnew SendTieZi;
    newUser->UserName = tiezi->UserName;
    newUser->arrTieZi = gcnew List<TieZi^>();
    newUser->arrTieZi->Add(tiezi);
    arrAllUserTieZi->Add(newUser);
}
// Downloads one page and files every post found on it into the per-user list.
//
// Parameters:
//   arrAllUserTieZi - per-user post list that receives the posts (passed on to AddTieZi).
//   webUrl          - address of the page to download; its bytes are decoded as UTF-8.
//   nPos            - page number; each post's nID is set to nPos*10 plus the number
//                     captured from the "memberinfo_<number>" id attribute.
//
// Exceptions raised by the download or by the number conversion are not caught here
// and propagate to the caller.
void AddWebAllTieZi(List<SendTieZi^>^ arrAllUserTieZi,String^ webUrl,int nPos)
{
    // Stack semantics: the WebClient is disposed when this scope is left, including
    // when an exception is thrown, so no client is leaked per downloaded page.
    WebClient webClient;

    // Download the page content.
    String^ webText = Encoding::UTF8->GetString(webClient.DownloadData(webUrl));

    // Matches a <div id="memberinfo_<id>" ...> element followed by one tag and then the
    // text captured as the user name.
    Regex^ regexMatchName = gcnew Regex("<div\\s*id=\"memberinfo_(?<id>\\d+)[^<>]*>\\s*<[^<>]*>\\s*(?<name>[^<>]+)\\s*</div>",RegexOptions::IgnoreCase);
    MatchCollection^ matchs = regexMatchName->Matches(webText);
    for each(Match^ m in matchs)
    {
        TieZi^ tiezi = gcnew TieZi;
        tiezi->UserName = m->Groups["name"]->Value->Trim();
        tiezi->nID = nPos*10+Convert::ToInt32(m->Groups["id"]->Value->Trim());
        AddTieZi(arrAllUserTieZi,tiezi);
    }
}
void GetAll()
{
List<SendTieZi^>^ arrAllUserTieZi = gcnew List<SendTieZi^>();
for (int i = 1 ; i < 850 ; i++ )
{
String^ strWebUrl = String::Format("http://bbs.ikaka.com/showtopic-8685806-{0}.aspx",i);
Console::Clear();
Console::WriteLine("正在处理第{0}页......",i);
AddWebAllTieZi(arrAllUserTieZi,strWebUrl,i);
}
for each(SendTieZi^ sendTiezi in arrAllUserTieZi)
{
Console::WriteLine(sendTiezi->UserName + "\t" + sendTiezi->arrTieZi->Count);
}
}