# -*- coding: utf-8 -*-
from scrapy.contrib.spiders import CrawlSpider, Rule
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.spider import BaseSpider
from scrapy.selector import HtmlXPathSelector
from scrapy.http.request import Request
import urlparse
from csdn.items import CsdnItem
class csdnSpider(CrawlSpider):
    """Crawl the article-list pages of one CSDN blog and dump matched pages.

    The spider seeds itself with list pages 1-38 of the target blog, follows
    links matching the ``rules`` pattern, and for each matched page writes the
    raw response body to a local file and yields a stub :class:`CsdnItem`.
    """

    name = "csdn"
    # The start URLs live on blog.csdn.net.  The original value was
    # "www.csdn.net" (written with curly quotes - a SyntaxError), which would
    # also have made OffsiteMiddleware drop every extracted link.
    allowed_domains = ["blog.csdn.net"]
    # List pages 1..38 of the target blog, generated instead of hand-written.
    start_urls = [
        "http://blog.csdn.net/huangxiansheng1980/article/list/%d" % page
        for page in range(1, 39)
    ]
    rules = (
        # Follow any link whose URL contains 'details/12513065' and hand the
        # fetched page to parse_item.
        Rule(SgmlLinkExtractor(allow=('details/12513065',)),
             callback='parse_item', follow=True),
    )

    def parse_item(self, response):
        """Save the raw page body to a local file and return a stub item.

        :param response: the Scrapy HTTP response for a matched article page.
        :returns: a ``CsdnItem`` with placeholder title/content values.
        """
        print('++++++++crawling ' + response.url)
        # Second-to-last URL path segment is used as the dump file name,
        # e.g. ".../details/12513065" -> "details".
        filename = response.url.split("/")[-2]
        # Close the handle deterministically instead of leaking it as the
        # original open(...).write(...) one-liner did.
        with open(filename, 'wb') as f:
            f.write(response.body)
        item = CsdnItem()
        # Scrapy Items require dict-style access; attribute assignment
        # (item.title = ...) raises AttributeError on scrapy.Item subclasses.
        # Placeholder values kept from the original code.
        item['title'] = 'test'
        item['content'] = 'fdfjdkf'
        return item
# Module-level spider instance.  Scrapy's crawler normally instantiates
# spiders itself, so this is not required by the framework — presumably kept
# for manual/interactive use; TODO confirm it is actually imported anywhere.
spider = csdnSpider()
# 版权所有,禁止转载. 如需转载,请先征得博主的同意,并且表明文章出处,否则按侵权处理.
# (All rights reserved; do not reproduce. Reproduction requires the author's
# prior consent and attribution of the source; otherwise it is treated as
# infringement.)