From Red Teal, 1 Year ago, written in Python.
Embed
  1. import scrapy
  2.  
  3. class WiesoWeshalbWarumSpider(scrapy.Spider):
  4.     name = "wieso"
  5.     start_urls = [
  6.         'https://www.ravensburger.de/produkte/buecher/wieso-weshalb-warum/index.html'
  7.     ]
  8.  
  9.     def parse(self, response):
  10.         for productHref in response.css("div.products div.headline a::attr(href)"):
  11.             yield response.follow(productHref, self.parse_product)
  12.            
  13.         for smallProductImgSrc in response.css("div.products div.upper img::attr(src)").extract():
  14.             yield scrapy.Request(url=smallProductImgSrc, callback=self.save_img)
  15.        
  16.         nextPage = response.selector.xpath('//nav[@aria-label="Page navigation"]//a[@aria-label="Next"]/@href').extract_first()
  17.         if nextPage is not None:
  18.             yield response.follow(nextPage, self.parse)
  19.            
  20.     def parse_product(self, response):
  21.         def extract_with_css(query):
  22.             return response.css(query).extract_first().strip()
  23.        
  24.         yield {
  25.             'reihe': response.selector.xpath('//div[@class="specificationContainer"]/div/a[contains(@href, "reihe")]/text()').extract_first(),
  26.             'titel': extract_with_css("div.productPriceContainer h1::text"),
  27.             'reihennummer': response.selector.xpath('//div[@class="specificationContainer"]/div/text()').re(r'\s*(.*)')[18],
  28.             'isbn': response.selector.xpath('//div[@class="specificationContainer"]/div/text()').re(r'\s*(.*)')[2],
  29.             'beschreibung': extract_with_css("div.leadIn::text"),
  30.             'weitereBeschreibung': response.selector.xpath('//div[@class="descContainer"]/div/text()').re(r'\s*(.*)')[2],
  31.             'autor': response.selector.xpath('//div[@class="specificationContainer"]/div/a[contains(@href, "author")]/text()').extract()
  32.         }
  33.        
  34.         yield scrapy.Request(url=response.css("a.ilightboxBigImage::attr(href)").extract_first(), callback=self.save_img)
  35.    
  36.     def save_img(self, response):
  37.         path = "img/" + response.url.split('/')[-2] + "-" + response.url.split('/')[-1]
  38.         self.logger.info('Saving image %s', path)
  39.         with open(path, 'wb') as f:
  40.             f.write(response.body)