BeautifulSoup
is a Python library for pulling data out of HTML and XML files.
This article explains how to install and use BeautifulSoup
library in Python and how to extract
HTML tags using BeautifulSoup
.
Install BeautifulSoup
.
pip install beautifulsoup4
Import BeautifulSoup
and create Soup
object.
from bs4 import BeautifulSoup
# NOTE: html_doc must already be defined when this line runs; the sample
# document is shown below. 'html.parser' selects Python's built-in HTML parser.
soup = BeautifulSoup(html_doc, 'html.parser')
Below is the test HTML document that we will be using as an example.
# Sample HTML document used by the examples. Two of the <a> tags share
# id="link2", so a search by that id returns more than one element.
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
and they lived at the bottom of a well.</p>
<p class="story">some sample data
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
<a href="http://example.com/samplelink" class="sister" id="link2">Tillie</a>
<a href="http://example.com/samplelink2" class="sister" id="link2">Tillie</a>
some dummy data
</p>
</body>
</html>
"""
Extract document title.
# Extract the <title> tag, then only the text inside it
print(soup.title)
print(soup.title.string)
Extract all <p> tags.
# Collect every <p> tag in the document and print the result
p_tags = soup.find_all("p")
print(p_tags)
Extract <p> tag with class name "story".
# Extract all <p> tags with class name "story"
print(soup.find_all("p",attrs={'class':"story"}))
Extract all <a> tags
# Collect every <a> tag in the document and print the result
a_tags = soup.find_all("a")
print(a_tags)
Extract all urls from <a> tags.
# Extract all urls from a tags: print the href attribute of each <a> tag
for a in a_tags:
    print(a.get('href'))
Extract all elements with id link2.
# Extract all elements whose id attribute is "link2"
print(soup.find_all(id="link2"))
Extract all text from the document.
# Extract all text from the document
print(soup.get_text())
Complete code snippet
from bs4 import BeautifulSoup
# Sample HTML document parsed by the examples below.
html_doc = """<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
and they lived at the bottom of a well.</p>
<p class="story">some sample data
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
and they lived at the bottom of a well.
<a href="http://example.com/samplelink" class="sister" id="link2">Tillie</a>
<a href="http://example.com/samplelink2" class="sister" id="link2">Tillie</a>
some dummy data
</p>
</body>
</html>
"""

# Parse the document with Python's built-in HTML parser.
soup = BeautifulSoup(html_doc, 'html.parser')

# Extract title and title text
print(soup.title)
print(soup.title.string)

# Extract all p tags
p_tags = soup.find_all("p")
print(p_tags)

# Extract p tags with class name "story"
print(soup.find_all("p", attrs={'class': "story"}))

# Extract all a tags
a_tags = soup.find_all("a")
print(a_tags)

# Extract all urls from a tags
for a in a_tags:
    print(a.get('href'))

# Extract all elements with id link2
print(soup.find_all(id="link2"))

# Extract all text from the document
print(soup.get_text())
Similar Articles