Simple-Web-Crawler-master
所属分类:搜索引擎
开发工具:C#
文件大小:797KB
下载次数:2
上传日期:2021-03-13 13:28:55
上 传 者:
shuangna
说明: C# 抓取网页数据功能示例代码(爬虫)
(C# sample code for scraping web page data (a web crawler))
文件列表:
.vs (0, 2017-05-22)
.vs\SimpleCrawler (0, 2017-05-22)
.vs\SimpleCrawler\v14 (0, 2017-05-22)
.vs\SimpleCrawler\v14\.suo (70656, 2017-05-24)
SimpleCrawler.sln (1029, 2016-07-08)
Wesley.Crawler.SimpleCrawler (0, 2017-05-24)
Wesley.Crawler.SimpleCrawler\App.config (1612, 2017-05-23)
Wesley.Crawler.SimpleCrawler\Ctrip.cs (2223, 2017-05-24)
Wesley.Crawler.SimpleCrawler\DbHelperOra.cs (18375, 2017-05-23)
Wesley.Crawler.SimpleCrawler\Events (0, 2017-05-22)
Wesley.Crawler.SimpleCrawler\Events\OnCompletedEventArgs.cs (840, 2016-07-08)
Wesley.Crawler.SimpleCrawler\Events\OnErrorEventArgs.cs (445, 2016-07-08)
Wesley.Crawler.SimpleCrawler\Events\OnStartEventArgs.cs (425, 2016-07-08)
Wesley.Crawler.SimpleCrawler\HtmlTag.cs (16209, 2017-05-23)
Wesley.Crawler.SimpleCrawler\HttpRequestUtility.cs (23796, 2017-05-23)
Wesley.Crawler.SimpleCrawler\ICrawler.cs (547, 2016-07-08)
Wesley.Crawler.SimpleCrawler\Images (0, 2017-05-22)
Wesley.Crawler.SimpleCrawler\Images\1.携程网城市列表.png (466822, 2016-07-08)
Wesley.Crawler.SimpleCrawler\Images\2.抓取网页源代码.png (59213, 2016-07-08)
Wesley.Crawler.SimpleCrawler\Images\3.使用正则清洗数据.png (58956, 2016-07-08)
Wesley.Crawler.SimpleCrawler\Images\4.抓取城市下的酒店列表.png (70128, 2016-07-08)
Wesley.Crawler.SimpleCrawler\Images\5.并发抓取示例.png (37250, 2016-07-08)
Wesley.Crawler.SimpleCrawler\Models (0, 2017-05-22)
Wesley.Crawler.SimpleCrawler\Models\City.cs (287, 2016-07-08)
Wesley.Crawler.SimpleCrawler\Models\Hotel.cs (335, 2016-07-08)
Wesley.Crawler.SimpleCrawler\Program.cs (10361, 2017-05-23)
Wesley.Crawler.SimpleCrawler\Properties (0, 2017-05-22)
Wesley.Crawler.SimpleCrawler\Properties\AssemblyInfo.cs (1328, 2016-07-08)
Wesley.Crawler.SimpleCrawler\SimpleCrawler.cs (5385, 2016-07-08)
Wesley.Crawler.SimpleCrawler\Wesley.Crawler.SimpleCrawler.csproj (3454, 2017-05-23)
Wesley.Crawler.SimpleCrawler\bin (0, 2017-05-22)
Wesley.Crawler.SimpleCrawler\bin\Debug (0, 2017-05-24)
Wesley.Crawler.SimpleCrawler\bin\Debug\Wesley.Crawler.SimpleCrawler.exe (39424, 2017-05-24)
Wesley.Crawler.SimpleCrawler\bin\Debug\Wesley.Crawler.SimpleCrawler.exe.config (1612, 2017-05-23)
Wesley.Crawler.SimpleCrawler\bin\Debug\Wesley.Crawler.SimpleCrawler.pdb (89600, 2017-05-24)
Wesley.Crawler.SimpleCrawler\bin\Debug\Wesley.Crawler.SimpleCrawler.vshost.exe (22696, 2017-05-24)
... ...
# 简单且高效的网站爬虫
基于C#.NET的简单网页爬虫,支持异步并发、设置代理、操作Cookie、Gzip页面加速。
今日头条@全栈解密:[查看完整教程](http://toutiao.com/a6304503113106555138/ "今日头条@全栈解密")
### 主要特性
- 支持Gzip根据网页内容自动解压,加快爬虫载入速度;
- 支持异步并发抓取;
- 支持自动事件通知;
- 支持代理切换;
- 支持操作Cookies;
### 运行截图
- 抓取城市列表
![使用正则表达式清洗数据](https://github.com/coldicelion/Simple-Web-Crawler/blob/master/Wesley.Crawler.SimpleCrawler/Images/3.%E4%BD%BF%E7%94%A8%E6%AD%A3%E5%88%99%E6%B8%85%E6%B4%97%E6%95%B0%E6%8D%AE.png?raw=true)
- 抓取酒店列表
![抓取城市下的酒店列表](https://github.com/coldicelion/Simple-Web-Crawler/blob/master/Wesley.Crawler.SimpleCrawler/Images/4.%E6%8A%93%E5%8F%96%E5%9F%8E%E5%B8%82%E4%B8%8B%E7%9A%84%E9%85%92%E5%BA%97%E5%88%97%E8%A1%A8.png?raw=true)
### 示例代码
/// <summary>
/// Crawls the Ctrip city-list page, extracts every hotel link found in the
/// page source, and prints each city name with its hotel-list URL to the console.
/// Blocks the calling thread until the crawl has finished.
/// </summary>
public static void CityCrawler() {
    var cityUrl = "http://hotels.ctrip.com/citylist"; // crawler entry URL
    var cityList = new List<City>(); // city names paired with their hotel-list URLs
    var cityCrawler = new SimpleCrawler();

    cityCrawler.OnStart += (s, e) =>
    {
        Console.WriteLine("爬虫开始抓取地址:" + e.Uri.ToString());
    };

    cityCrawler.OnError += (s, e) =>
    {
        Console.WriteLine("爬虫抓取出现错误:" + e.Uri.ToString() + ",异常消息:" + e.Exception.Message);
    };

    cityCrawler.OnCompleted += (s, e) =>
    {
        // Extract the data from the page source with a regular expression.
        // "href" captures the relative /hotel/... link and "text" captures the
        // anchor text; anchors whose remaining text contains "img" are skipped
        // by the negative lookahead.
        var links = Regex.Matches(
            e.PageSource,
            @"<a[^>]+href=""*(?<href>/hotel/[^>\s]+)""\s*[^>]*>(?<text>(?!.*img).*?)</a>",
            RegexOptions.IgnoreCase);

        foreach (Match match in links)
        {
            var city = new City
            {
                CityName = match.Groups["text"].Value,
                Uri = new Uri("http://hotels.ctrip.com" + match.Groups["href"].Value)
            };

            // NOTE(review): Contains relies on City's equality; if City does not
            // override Equals this never filters duplicates - confirm against City.cs.
            if (!cityList.Contains(city)) cityList.Add(city);

            Console.WriteLine(city.CityName + "|" + city.Uri); // show city name and URL
        }

        Console.WriteLine("===============================================");
        Console.WriteLine("爬虫抓取任务完成!合计 " + links.Count + " 个城市。");
        Console.WriteLine("耗时:" + e.Milliseconds + "毫秒");
        Console.WriteLine("线程:" + e.ThreadId);
        Console.WriteLine("地址:" + e.Uri.ToString());
    };

    // Wait for the crawl to complete. The original author's note advises not to
    // use a proxy (example given: 60.221.50.118:8090) unless the site has blocked you.
    cityCrawler.Start(new Uri(cityUrl)).Wait();
}
### 技术探讨/联系方式
- QQ号: 276679490
- 爬虫架构讨论群:180085853
近期下载者:
相关文件:
收藏者: