# Scrapy: follow the links
from scrapy.spiders import CrawlSpider
class SuperSpider(CrawlSpider):
    """Crawl the Wikipedia "Web scraping" article and the pages it links to.

    For every response, ``parse`` schedules a follow-up request for each
    paragraph link and yields one item per ``<h1>`` text node found on the
    page.
    """

    # Identifier Scrapy uses to select this spider (``scrapy crawl follower``).
    name = 'follower'
    # Requests to hosts outside this list are filtered out as off-site.
    allowed_domains = ['en.wikipedia.org']
    start_urls = ['https://en.wikipedia.org/wiki/Web_scraping']
    # Not referenced inside this class; kept for code that reads it externally.
    base_url = 'https://en.wikipedia.org'
    custom_settings = {
        # Only follow links one hop away from the start URL.
        'DEPTH_LIMIT': 1,
    }

    def parse_start_url(self, response, **kwargs):
        """Hand responses for ``start_urls`` over to :meth:`parse`.

        NOTE(review): recent Scrapy releases appear to route start-URL
        responses through ``CrawlSpider``'s internal parsing and this hook
        rather than through ``parse``; without this delegation the start page
        would produce no items and no follow-up requests. On releases that
        call ``parse`` directly this hook is simply never invoked. Confirm
        against the installed Scrapy version.
        """
        return self.parse(response)

    def parse(self, response):
        """Yield follow-up requests for paragraph links, then heading items.

        Args:
            response: The downloaded page.

        Yields:
            Requests for every ``<a href=...>`` found directly inside a
            ``<p>`` that is a child of a ``<div>``, each calling back into
            this method, followed by ``{'quote': <text>}`` dicts for every
            text node directly under an ``<h1>``.
        """
        # Restrict to anchors that carry an href: ``response.follow`` raises
        # ValueError for an <a> element without one (e.g. Wikipedia's
        # self-links), which would abort the callback mid-page.
        for next_page in response.xpath('.//div/p/a[@href]'):
            yield response.follow(next_page, self.parse)

        for quote in response.xpath('.//h1/text()'):
            yield {'quote': quote.extract()}