写了个大众点评的爬虫脚本, 分享下源码
来源: http://bbs.51cto.com/thread-1400050-1.html
// Search keyword and seed URL for the 神箭手 (shenjianshou) cloud-crawler framework.
var keywords = "黄焖鸡米饭";
var scanUrls = ["http://www.dianping.com/search/keyword/1/0_" + keywords];

var configs = {
    domains: ["dianping.com"],
    scanUrls: scanUrls,
    // List pages: /search/keyword/<cityId>/0_<keyword>
    helperUrlRegexes: ["http://www.dianping.com/search/keyword/\\d+/0_.*"],
    // Detail pages: the shop "editmember" page, which carries the fields below.
    contentUrlRegexes: ["http://www.dianping.com/shop/\\d+/editmember"],
    enableProxy: true,
    interval: 5000, // ms between requests
    fields: [
        {
            name: "shop_name",
            selector: "//div/div/h3/a/text()"
        },
        {
            // FIX: this selector was corrupted into an HTML anchor by the forum
            // paste; restored to the plain XPath attribute selector "@href".
            name: "id",
            selector: "//div/div/h3/a/@href"
        },
        {
            name: "create_time",
            selector: "//div/ul/li/span"
        },
        {
            name: "region_name",
            selector: "//div[@class='breadcrumb']/b/a/span/text()",
            required: true
        },
        {
            name: "province_name",
            selector: "//div[@class='breadcrumb']/b/a/span/text()"
        }
    ]
};
035
// Called for every list (helper) page: queue shop detail pages and pagination.
// FIX: both XPath selectors were corrupted into HTML anchors by the forum
// paste; restored to "@href".
configs.onProcessHelperUrl = function (url, content, site) {
    var urls = extractList(content, "//div[@class='tit']/a/@href");
    for (var i = 0; i < urls.length; i++) {
        // FIX: the original enqueued `urls + "/editmember"` — the whole array
        // coerced to one string — instead of each individual shop link.
        site.addUrl(urls[i] + "/editmember");
    }
    // Follow pagination: queue the "next" link plus the two pages after it.
    var nextPage = extract(content, "//div[@class='page']/a[@class='next']/@href");
    if (nextPage) {
        site.addUrl(nextPage);
        var result = /\d+$/.exec(nextPage);
        if (result) {
            // FIX: use the matched string (result[0]); the original assigned
            // the match *array*, whose .length is 1, so the computed prefix
            // length was wrong. Also removed the unused `lll` variable and
            // added an explicit radix to parseInt.
            var pageNum = result[0];
            var prefix = nextPage.substr(0, nextPage.length - pageNum.length);
            site.addUrl(prefix + (parseInt(pageNum, 10) + 1));
            site.addUrl(prefix + (parseInt(pageNum, 10) + 2));
        }
    }
    // Helper pages are never themselves treated as content pages.
    return false;
}
055
// Post-process each extracted field before it is saved.
configs.afterExtractField = function (fieldName, data, page) {
    if (fieldName == "id") {
        // The shop id is the trailing digit run of the @href (e.g. /shop/12345).
        // FIX: keep the matched string (result[0]); the original assigned the
        // whole match array to data.
        var idMatch = /\d+$/.exec(data);
        if (idMatch) {
            data = idMatch[0];
        }
    }
    else if (fieldName == "shop_name") {
        // Discard shops whose name does not contain the search keyword.
        if (data.indexOf("黄焖鸡米饭") == -1) {
            page.skip();
        }
    }
    else if (fieldName == "create_time") {
        // Dates arrive as "YY-MM-DD"; prepend the century.
        // FIX: guard against a non-matching value (the original produced
        // "20null") and use the matched string rather than the match array.
        var timeMatch = /\d{2}-\d{2}-\d{2}$/.exec(data);
        if (timeMatch) {
            data = "20" + timeMatch[0];
        }
    }
    else if (fieldName == "province_name" || fieldName == "region_name") {
        // Truncate after the first 县 (county) or 市 (city) suffix, then drop
        // a trailing "餐厅" (restaurant) marker.
        var position = data.indexOf("县");
        if (position != -1 && position < data.length - 1) {
            data = data.substr(0, position + 1);
        }
        position = data.indexOf("市");
        if (position != -1 && position < data.length - 1) {
            data = data.substr(0, position + 1);
        }
        data = data.replace("餐厅", "");
        if (fieldName == "province_name") {
            // getProvinceNameByRegion is supplied by the crawler framework.
            data = getProvinceNameByRegion(data);
        }
    }
    return data;
}
088
// Produce the next seed URL by advancing the city id in
// /search/keyword/<cityId>/0_<keyword>. Returns null when done.
// FIX: the original compared and incremented the regex match *array*
// (`num < 2323` coerces to NaN and is always false, so the crawl never
// advanced past the first city); parse the captured digits instead.
configs.nextScanUrl = function (url) {
    var match = /\/(\d+)\//.exec(url);
    if (match) {
        var num = parseInt(match[1], 10);
        if (num < 2323) { // 2323 appears to be the last dianping city id
            return "http://www.dianping.com/search/keyword/" + (num + 1) + "/0_" + keywords;
        }
    }
    return null;
}
099
// Instantiate the framework crawler with the configuration above and run it.
var crawler = new Crawler(configs);
crawler.start();
爬虫脚本可以在神箭手云爬虫框架上运行，对爬虫有兴趣的可以试试，欢迎来交流~
楼主牛 学习学习 高手啊,牛逼 学习了。。顺便薅羊毛
页:
[1]