lundi 27 juin 2016

scrapy query database not return empty xpath

Good evening!

I used Scrapy to crawl some songs from this site: http://www.vagalume.com.br/ (this is how vagalume.json was generated).

The idea now is to crawl the same songs I crawled from the site above on another site: https://www.letras.mus.br

I tried to read the data from vagalume.json and search for each song on the second site (letras.mus.br), but the XPath that should select the results div returns nothing.

I think the reason is that the spider finishes reading the search page before the server returns the query results. I'm not sure, though. What can I do about it?

Here is the code (the current parse method I was using for debug):

import scrapy


class MusicItem(scrapy.Item):
    """Scrapy item describing one scraped song."""

    # Song title.
    name = scrapy.Field()
    # Name of the song's author/artist.
    author = scrapy.Field()
    # Lyrics text of the song.
    lyrics = scrapy.Field()

Above is the item. Here is the spider:

import json
from urllib.parse import quote

import scrapy

from Letras.items import MusicItem

class LetrasSpider(scrapy.Spider):
    """Spider that looks up songs on letras.mus.br and scrapes their lyrics.

    ``parse`` is a debugging callback that only logs the nodes found in the
    search-results container. ``parse2`` is the intended entry point: it
    reads ``vagalume.json`` and issues one search request per song.

    NOTE(review): the selectors used for the search page (``gsc-*`` classes,
    ``cse-search-results`` id) look like markup injected client-side by a
    JavaScript search widget. If so, those nodes are absent from the HTML
    Scrapy downloads and the selectors will always come back empty --
    confirm by inspecting ``response.text`` rather than the browser DOM.
    """

    name = "letras"
    allowed_domains = ["letras.mus.br"]
    start_urls = [
        "https://www.letras.mus.br/?q=peter%20hollens%20misty%20mountains"
    ]

    def cleanString(self, text):
        """Return ``text`` reduced to alphanumerics and single-space separators.

        Alphanumeric characters are kept, every whitespace character becomes
        a plain space, and all other characters are dropped.
        """
        return "".join(
            char if char.isalnum() else " "
            for char in text
            if char.isalnum() or char.isspace()
        )

    def retrieveLyrics(self, response):
        """Join the cleaned text of every selector in ``response``.

        ``response`` is the selector list of lyric text nodes (not a full
        response object). Each cleaned fragment is followed by one space, so
        a non-empty result ends with a trailing space.
        """
        return "".join(
            self.cleanString(sentence) + " " for sentence in response.extract()
        )

    def retrieveMusicName(self, response):
        """Return the cleaned ``h1`` text under ``response``, or ``""`` if absent."""
        return self.cleanString(
            response.xpath('h1/text()').extract_first(default="")
        )

    def retrieveAuthor(self, response):
        """Return the cleaned ``h2/a`` text under ``response``, or ``""`` if absent."""
        return self.cleanString(
            response.xpath('h2/a/text()').extract_first(default="")
        )

    def parseOneMusic(self, response):
        """Yield a ``MusicItem`` built from a single song page.

        Pages without the expected title header are logged and skipped
        instead of raising ``IndexError`` inside the callback.
        """
        header = response.xpath('//div[@class="cnt-head cnt-head--l"]/div[@class="cnt-head_title"]')
        if not header:
            self.logger.warning("No song header found on %s; skipping", response.url)
            return

        lyrics = self.retrieveLyrics(response.xpath('//div[@class="g-pr g-sp"]/div[@class="cnt-letra p402_premium"]/article/p/text()'))

        item = MusicItem()
        item['name'] = self.retrieveMusicName(header)
        item['author'] = self.retrieveAuthor(header)
        item['lyrics'] = lyrics

        yield item

    def parseOneAuthor(self, response):
        """Yield one request per song linked from an author page."""
        for href in response.xpath('//ul[@class="cnt-list"]/li/a[1]/@href').extract():
            yield scrapy.Request(response.urljoin(href), callback=self.parseOneMusic)

    def parseQuery(self, response):
        """Return a request for the first search result, or ``None`` if none.

        The original code passed the list returned by ``extract()`` (the HTML
        of the ``<a>`` elements) straight to ``scrapy.Request``, which needs a
        single URL string. The ``href`` attribute of the first match is used
        instead.
        """
        # NOTE(review): the selector contains ``tbody``, which may not be
        # present in the raw HTML -- verify against the downloaded page.
        href = response.css('.gsc-expansionArea > div:nth-child(1) > div:nth-child(1) > table:nth-child(3) > tbody:nth-child(1) > tr:nth-child(1) > td:nth-child(2) > div:nth-child(1) > a:nth-child(1)::attr(href)').extract_first()
        self.logger.info("ParseQuery url = {0}".format(href))
        if not href:
            self.logger.warning("No search result link found on %s", response.url)
            return None
        return scrapy.Request(response.urljoin(href), callback=self.parseOneMusic)

    def treatAuthorName(self, authorName):
        """Return ``authorName`` lower-cased with spaces replaced by hyphens."""
        return authorName.lower().replace(" ", "-")

    def parse(self, response):
        """Debug callback: log each child node of the search-results container."""
        for href in response.xpath('//div[@class="wrapper"]/div[@id="all"]/div[@id="cnt_top"]/div[@id="res_busca"]/div[@id="resultado"]/div[@class="all"]/div[@id="cse-search-results"]/div'):
            # NOTE(review): the trailing "n" looks like a "\n" escape that
            # lost its backslash; left unchanged to keep the log output as is.
            self.logger.info("LoggingParse href = {0}n".format(href))

    def parse2(self, response):
        """Yield one search request per song listed in ``vagalume.json``.

        The file is read from the current working directory and is expected
        to hold a list of objects with ``author`` and ``name`` keys.
        """
        # Load everything first so the file is closed before any request is
        # yielded; a generator would otherwise hold it open while suspended.
        with open('vagalume.json', encoding='utf-8') as vagalume_file:
            vagalumeJson = json.load(vagalume_file)

        for vagalumeItem in vagalumeJson:
            query = "{0} {1}".format(vagalumeItem["author"], vagalumeItem["name"])
            # Percent-encode the whole query so characters such as '&', '#'
            # or '?' in a title cannot break out of the ``q`` parameter.
            url = "https://www.letras.mus.br/?q={0}".format(quote(query, safe=""))
            yield scrapy.Request(url, callback=self.parseQuery)

Aucun commentaire:

Enregistrer un commentaire