Good evening!
I used Scrapy to crawl some songs from this site: http://www.vagalume.com.br/ (this is how vagalume.json was generated).
The idea now is to crawl the same songs from another site: https://www.letras.mus.br
I tried to read the data from vagalume.json and search for each song on the second site (letras.mus.br), but the XPath for the results div returns nothing.
I think the reason is that the spider finishes reading the search page before the server returns the query results. I'm not sure, though. What can I do about it?
Here is the code (the current parse method is the one I was using for debugging):
import scrapy
class MusicItem(scrapy.Item):
    """Container for one scraped song."""

    # Song title as shown on the song page.
    name = scrapy.Field()
    # Name of the artist/author of the song.
    author = scrapy.Field()
    # Full lyrics text of the song.
    lyrics = scrapy.Field()
Above is the item. Here is the spider:
import scrapy
import json
from Letras.items import MusicItem
class LetrasSpider(scrapy.Spider):
    """Spider that searches letras.mus.br for songs and scrapes their lyrics.

    ``parse`` is the callback Scrapy invokes for ``start_urls``; at the
    moment it only logs the nodes found under the search-results container.
    ``parse2`` builds one search request per entry of ``vagalume.json`` and
    routes the responses to ``parseQuery``.

    NOTE(review): the search-result selectors target ``cse-search-results``
    and ``gsc-*`` nodes, which look like a Google Custom Search widget. Such
    widgets are presumably filled in by JavaScript after page load, so the
    HTML that Scrapy downloads may never contain the results. Confirm by
    inspecting ``response.text``.
    """

    name = "letras"
    allowed_domains = ["letras.mus.br"]
    start_urls = [
        "https://www.letras.mus.br/?q=peter%20hollens%20misty%20mountains"
    ]

    def cleanString(self, text):
        """Return ``text`` reduced to alphanumerics and single-space separators.

        Alphanumeric characters are kept, every whitespace character becomes
        a plain space, and all other characters are dropped.
        """
        return "".join(
            c if c.isalnum() else " "
            for c in text
            if c.isalnum() or c.isspace()
        )

    def retrieveLyrics(self, response):
        """Concatenate the cleaned text of every selector in ``response``.

        ``response`` is iterated and ``.extract()`` is called on each element,
        so it is expected to be a selector list of text nodes. Each cleaned
        fragment is followed by one space (including the last one).
        """
        return "".join(
            self.cleanString(sel.extract()) + " " for sel in response
        )

    def retrieveMusicName(self, response):
        """Return the cleaned text of the first ``h1`` under ``response``.

        Returns an empty string when no ``h1`` text node exists instead of
        raising ``IndexError``.
        """
        return self.cleanString(
            response.xpath('h1/text()').extract_first(default='')
        )

    def retrieveAuthor(self, response):
        """Return the cleaned text of the first ``h2/a`` under ``response``.

        Returns an empty string when no such text node exists instead of
        raising ``IndexError``.
        """
        return self.cleanString(
            response.xpath('h2/a/text()').extract_first(default='')
        )

    def parseOneMusic(self, response):
        """Yield a ``MusicItem`` built from a single song page.

        Pages without the expected title container are skipped with a
        warning rather than crashing the callback.
        """
        lyrics = self.retrieveLyrics(response.xpath(
            '//div[@class="g-pr g-sp"]/div[@class="cnt-letra p402_premium"]'
            '/article/p/text()'
        ))
        sel = response.xpath(
            '//div[@class="cnt-head cnt-head--l"]/div[@class="cnt-head_title"]'
        )
        if not sel:
            self.logger.warning(
                "No title block found on {0}".format(response.url)
            )
            return

        item = MusicItem()
        item['name'] = self.retrieveMusicName(sel)
        item['author'] = self.retrieveAuthor(sel)
        item['lyrics'] = lyrics
        yield item

    def parseOneAuthor(self, response):
        """Yield one request per song link listed on an artist page."""
        for href in response.xpath('//ul[@class="cnt-list"]/li/a[1]/@href'):
            url = response.urljoin(href.extract())
            yield scrapy.Request(url, callback=self.parseOneMusic)

    def parseQuery(self, response):
        """Follow the first search result to its song page.

        Returns a request for ``parseOneMusic``, or ``None`` when the result
        link is not present in the downloaded HTML.
        """
        # Select the link's href attribute as a single string; the previous
        # code extracted a list of <a> elements' HTML, which is not a URL.
        href = response.css(
            '.gsc-expansionArea > div:nth-child(1) > div:nth-child(1) > '
            'table:nth-child(3) > tbody:nth-child(1) > tr:nth-child(1) > '
            'td:nth-child(2) > div:nth-child(1) > a:nth-child(1)'
            '::attr(href)'
        ).extract_first()
        self.logger.info("ParseQuery url = {0}".format(href))
        if not href:
            # NOTE(review): browsers insert <tbody> and run the search
            # widget's JavaScript; the raw response may contain neither.
            self.logger.warning(
                "No search result link found on {0}".format(response.url)
            )
            return None
        return scrapy.Request(
            response.urljoin(href), callback=self.parseOneMusic
        )

    def treatAuthorName(self, authorName):
        """Return ``authorName`` lower-cased with spaces replaced by hyphens."""
        return authorName.lower().replace(" ", "-")

    def parse(self, response):
        """Debug callback: log every node under the search-results container."""
        results = response.xpath(
            '//div[@class="wrapper"]/div[@id="all"]/div[@id="cnt_top"]'
            '/div[@id="res_busca"]/div[@id="resultado"]/div[@class="all"]'
            '/div[@id="cse-search-results"]/div'
        )
        for href in results:
            self.logger.info("LoggingParse href = {0}n".format(href))

    def parse2(self, response):
        """Yield one search request per song stored in ``vagalume.json``.

        Each JSON entry must provide ``"author"`` and ``"name"`` keys. The
        ``response`` argument is not used.
        """
        with open('vagalume.json') as vagalume_file:
            vagalumeJson = json.load(vagalume_file)

        for vagalumeItem in vagalumeJson:
            url = "https://www.letras.mus.br/?q={0} {1}".format(
                vagalumeItem["author"], vagalumeItem["name"]
            )
            yield scrapy.Request(url, callback=self.parseQuery)
Aucun commentaire:
Enregistrer un commentaire