dimanche 12 juin 2016

BeautifulSoup findAll HTML class with multiple variable class inputs

I have the following code, which scrapes a website for `tr` elements with the class "odd" or "even". I'd like to turn "odd" and "even" into arguments of my function, so that I can pass in other classes as well. Here is my code:

#
# Imports
#

import urllib2
from bs4 import BeautifulSoup
import re
import os
from pprint import pprint

#
# library
#

def get_soup(url):
    """Download *url* and return its "even"/"odd" table rows as one string.

    The page is parsed with BeautifulSoup's "html.parser" backend, the ``tr``
    elements selected by the ``["even", "odd"]`` filter are collected, and the
    string form of the resulting list is returned.
    """
    response = urllib2.urlopen(url)
    markup = BeautifulSoup(response.read(), "html.parser")
    rows = markup.findAll("tr", ["even", "odd"])
    # Copy the result set into a plain list before stringifying it.
    return str(list(rows))


def save_to_file(path, soup):
    """Write the string *soup* to the file at *path*, replacing any content."""
    with open(path, 'w') as out_file:
        out_file.write(soup)


#
# script
#

def main():
    """Scrape the configured URL and save the extracted rows to a file.

    ``url`` and ``path`` are placeholders that must be filled in before the
    script is run.
    """
    url = r'URL GOES HERE'
    # The closing quote was missing on this literal, which made the whole
    # script fail with a SyntaxError before anything ran.
    path = os.path.join('PATH GOES HERE')
    the_soup = get_soup(url)
    save_to_file(path, the_soup)



# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

I'd like to incorporate *args into the code. So the get_soup function would look like this:

def get_soup(url, *args):
    """Download *url* and return the ``tr`` rows with the given classes.

    Args:
        url: Address of the page to fetch.
        *args: Class names to look for, e.g. ``"odd", "even"``. They are
            handed to BeautifulSoup as one flat list of class names. When no
            class name is given, every ``tr`` row is returned.

    Returns:
        The string form of the list of matching rows.
    """
    page = urllib2.urlopen(url)
    contents = page.read()
    soup = BeautifulSoup(contents, "html.parser")
    if args:
        # ``args`` is already a tuple of class names. Writing ``[args]``
        # produced a list holding that single tuple, so the filter was not a
        # list of class strings; ``list(args)`` yields e.g. ["odd", "even"].
        body = soup.findAll("tr", list(args))
    else:
        # No class filter requested: fall back to every table row.
        body = soup.findAll("tr")
    return str(list(body))

def main():
    """Scrape the configured URL for "odd"/"even" rows and save them to a file.

    ``url`` and ``path`` are placeholders that must be filled in before the
    script is run.
    """
    url = r'URL GOES HERE'
    # The closing quote was missing on this literal, which made the whole
    # script fail with a SyntaxError before anything ran.
    path = os.path.join('PATH GOES HERE')
    the_soup = get_soup(url, "odd", "even")
    save_to_file(path, the_soup)

Unfortunately, this isn't working. Any ideas?

Aucun commentaire:

Enregistrer un commentaire