Web Scraping

Created
Tags

Introduction

import pandas as pd
import requests
from bs4 import BeautifulSoup as bs

# Download the practice page; `.content` is the raw body of the response.
webpage_response = requests.get('https://s3.amazonaws.com/codecademy-content/courses/beautifulsoup/shellter.html')

webpage = webpage_response.content

# Parse the body that was already downloaded. `webpage` holds HTML, not a URL,
# so it goes straight to the parser instead of through a second requests.get().
soup = bs(webpage, 'html.parser')

Access

# `soup.div` is shorthand for the first <div> in the document; look it up once.
first_div = soup.div

# Tag name and attribute dictionary of that <div>.
print(first_div.name)
print(first_div.attrs)

# The tag itself, followed by its `.string` value.
print(first_div)
print(first_div.string)


# `.parents` walks upwards, yielding each enclosing element in turn.
for ancestor in first_div.parents:
    print(ancestor)

Find All

# A single tag name: every <a> element.
arr = soup.find_all(name="a")

# A list of tag names: every element that is either a <p> or an <h1>.
wanted_names = ["p", "h1"]
arr2 = soup.find_all(name=wanted_names)

# An attribute filter: elements matching both the class and the id given here.
attr_filter = {'class': 'banner', 'id': 'jumbotron'}
arr3 = soup.find_all(attrs=attr_filter)

def func(tag):
    """Return True for a tag with class "banner" whose string is "Hello world".

    Written to be passed to ``find_all`` as a filter function.

    Args:
        tag: A Beautiful Soup tag, or any object exposing ``get(name)`` for
            attribute lookup and a ``string`` attribute.

    Returns:
        bool: True only when "banner" is among the tag's classes and its
        ``string`` equals "Hello world".
    """
    # Attributes are read with ``.get``; a tag has no ``attr()`` method.
    # ``class`` is multi-valued, so the parser normally hands back a list
    # such as ["banner", "wide"]. A missing attribute yields None.
    classes = tag.get("class") or []
    if isinstance(classes, str):
        # Tolerate a plain space-separated string as well.
        classes = classes.split()
    return "banner" in classes and tag.string == "Hello world"

# A function can serve as the filter: find_all calls it on each tag and keeps
# the tags for which it returns True.
soup.find_all(name=func)

Select

# Base URL prepended to every href found on the index page.
# NOTE(review): assumed to be the directory of the shellter.html page fetched
# earlier in these notes, with the hrefs relative to it -- confirm.
prefix = "https://s3.amazonaws.com/codecademy-content/courses/beautifulsoup/"

webpage = webpage_response.content
# The file imports BeautifulSoup under the alias `bs`.
soup = bs(webpage, "html.parser")

# Go through all of the <a> tags and build the link associated with each one.
turtle_links = soup.find_all("a")
links = [prefix + a["href"] for a in turtle_links]

# Define turtle_data: filled below with one entry per followed link.
turtle_data = {}

# Follow each link:
for link in links:
    # A separate name keeps `webpage` (the index page body) from being
    # overwritten with a Response object.
    turtle_response = requests.get(link)
    turtle = bs(turtle_response.content, "html.parser")
    turtle_name = turtle.select(".name")[0]
    # get_text("|") joins the text pieces inside the first <ul> with "|", so
    # splitting on "|" turns them back into a list.
    # NOTE(review): the outer [...] stores that whole list in a single
    # DataFrame cell; drop it if one column per item was intended.
    turtle_data[turtle_name.get_text()] = [turtle.find("ul").get_text("|").split("|")]

print(turtle_data)
# orient='index' makes each dictionary key a row label.
turtle_df = pd.DataFrame.from_dict(turtle_data, orient='index')

print(turtle_df)