# Introduction
from urllib.parse import urljoin

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
# Download the Shellter landing page.
webpage_response = requests.get(
    'https://s3.amazonaws.com/codecademy-content/courses/beautifulsoup/shellter.html'
)
# Raw body of the response (bytes).
webpage = webpage_response.content
# Parse the markup that was already downloaded. The body must not be handed
# back to requests.get(): that function expects a URL, and the parser name
# belongs to the BeautifulSoup constructor, not to the HTTP call.
soup = bs(webpage, 'html.parser')
# Access
# Inspect the first <div> of the parsed document.
print(soup.div.name)    # tag name
print(soup.div.attrs)   # dict of the tag's attributes
print(soup.div)         # the tag itself, markup included
print(soup.div.string)  # the tag's single string; None unless it wraps exactly one

# Walk upwards from the <div> through every enclosing element.
for parent in soup.div.parents:
    print(parent)
# Find All
# Every <a> tag in the document.
arr = soup.find_all('a')
# Every tag that is either a <p> or an <h1>.
arr2 = soup.find_all(['p', 'h1'])
# Tags matched on both attributes at once: class "banner" and id "jumbotron".
arr3 = soup.find_all(
    attrs={
        "class": "banner",
        "id": "jumbotron",
    }
)
def func(tag):
    """Return True if *tag* has the class "banner" and the text "Hello world".

    Intended as a filter for ``find_all``: it receives one tag at a time and
    must return a truthy value for the tags to keep.

    Args:
        tag: A tag-like object exposing ``get(name)`` for attribute lookup
            and a ``string`` attribute.

    Returns:
        bool: True only when "banner" is one of the tag's classes and its
        ``string`` equals "Hello world".
    """
    # Attributes are read with get(); it yields None when "class" is absent.
    classes = tag.get('class') or []
    # "class" is multi-valued and normally arrives as a list, but a plain
    # string is tolerated too (whitespace-separated class names).
    if isinstance(classes, str):
        classes = classes.split()
    return 'banner' in classes and tag.string == "Hello world"
# Use func as the filter: find_all keeps the tags for which func returns True.
# The resulting list is not stored here.
soup.find_all(func)
# Select
# Parse the landing page held in webpage_response.
webpage = webpage_response.content
soup = bs(webpage, "html.parser")  # BeautifulSoup is imported under the alias bs

# Only anchors that actually carry an href; a bare <a> would raise KeyError below.
turtle_links = soup.find_all("a", href=True)

# Go through all of the a tags and get the links associated with them.
# Each href is resolved against the landing page's own URL, so relative
# links become absolute and already-absolute links are left untouched.
links = [urljoin(webpage_response.url, a["href"]) for a in turtle_links]
# Define turtle_data: maps the text of each page's ".name" element to the
# data scraped from that page's first <ul>.
turtle_data = {}

# Follow each link:
for link in links:
    webpage = requests.get(link)
    turtle = bs(webpage.content, "html.parser")  # bs is the BeautifulSoup alias

    # First element matching the CSS class "name".
    turtle_name = turtle.select(".name")[0]

    # Join the <ul>'s text pieces with "|" and split them apart again.
    # NOTE(review): the split result is wrapped in a one-element list, so each
    # dict value is a list containing a list -- confirm this is intended.
    turtle_data[turtle_name.get_text()] = [turtle.find("ul").get_text("|").split("|")]

print(turtle_data)
# Build a table with one row per key of turtle_data, then display it.
turtle_df = pd.DataFrame.from_dict(
    data=turtle_data,
    orient='index',
)
print(turtle_df)