J Rogel
Data scientist, physicist, numerical analyst, machine learner, human
Data often lives structured in databases; on the web, we have to extract it from a page's source.
Extracting data from a web page's source
For example: http://isitweekendyet.com/
<!DOCTYPE html>
<html>
<head>
<title>Page Title</title>
</head>
<body>
<h1>My First Heading</h1>
<p>My first paragraph.</p>
</body>
</html>
# Python 3
from urllib.request import urlopen
url = 'http://isitweekendyet.com/'
pageSource = urlopen(url).read()
# Python 2
from urllib import urlopen
url = 'http://isitweekendyet.com/'
pageSource = urlopen(url).read()
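If a script needs to run under both versions, one option is a try/except import (a minimal sketch; the rest of these notes assume Python 3):

# version-agnostic import: try Python 3 first, fall back to Python 2
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib import urlopen  # Python 2
url = 'http://isitweekendyet.com/'
pageSource = urlopen(url).read()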
from bs4 import BeautifulSoup
weekendSoup = BeautifulSoup(pageSource, 'lxml')
You didn't write that awful page.
You're just trying to get some data out of it.
Beautiful Soup is here to help.
Since 2004, it's been saving programmers hours or days of work on quick-turnaround screen scraping projects.
http://www.crummy.com/software/BeautifulSoup/
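Beautiful Soup 4 lives on PyPI as beautifulsoup4; the examples below also assume the lxml parser is installed (pip install beautifulsoup4 lxml).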
>>> from bs4 import BeautifulSoup
>>> weekendSoup = BeautifulSoup(pageSource, 'lxml')
>>> weekendSoup.title
<title>Is it weekend yet?</title>
>>> weekendSoup.title.string
u'Is it weekend yet?'
>>> tag = weekendSoup.div
>>> tag
<div style="font-weight: bold; font-size: 120pt; font-family: Helvetica Neue, Helvetica, Swis721 BT, Arial, sans-serif; text-decoration: none; color: black;">
YES!
</div>
>>> type(tag)
<class 'bs4.element.Tag'>
>>> tag.string
u'\nYES!\n'
>>> type(tag.string)
<class 'bs4.element.NavigableString'>
>>> type(weekendSoup)
<class 'bs4.BeautifulSoup'>
>>> weekendSoup.name
u'[document]'
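A Tag also carries its HTML attributes, exposed dict-style (standard bs4 behaviour; output truncated here):

>>> tag.attrs
{'style': 'font-weight: bold; font-size: 120pt; ...'}
>>> tag.get('style')  # dict-style lookup; returns None if the attribute is absent
'font-weight: bold; font-size: 120pt; ...'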
>>> markup = "<b><!--This is a very special message--></b>"
>>> cSoup = BeautifulSoup(markup, 'lxml')
>>> comment = cSoup.b.string
>>> type(comment)
<class 'bs4.element.Comment'>
>>> print(cSoup.b.prettify())
<b>
<!--This is a very special message-->
</b>
>>> bodyTag = weekendSoup.body
>>> bodyTag.contents
[u'\n', <div class="answer text" id="answer" style="font-weight: bold; font-size: 120pt; font-family: Helvetica Neue, Helvetica, Swis721 BT, Arial, sans-serif; text-decoration: none; color: black;">
YES!
</div>, u'\n', <div style="font-size: 5pt; font-family: Helvetica Neue, Helvetica, Swis721 BT, Arial, sans-serif; text-decoration: none; color: gray;">
<a href="http://theweekendhaslanded.org">The weekend has landed!</a>
</div>, u'\n']
<div class="answer text" id="answer" style="font-weight: bold; font-size: 120pt; font-family: Helvetica Neue, Helvetica, Swis721 BT, Arial, sans-serif; text-decoration: none; color: black;">
YES!
</div>
<div style="font-size: 5pt; font-family: Helvetica Neue, Helvetica, Swis721 BT, Arial, sans-serif; text-decoration: none; color: gray;">
<a href="http://theweekendhaslanded.org">The weekend has landed!</a>
</div>
>>> weekendSoup.div.string
u'\nYES!\n'
>>> for ss in weekendSoup.div.stripped_strings:
...     print(ss)
...
YES!
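When a tag has more than one child, .string is None; get_text() pulls out all the text in one go (standard bs4 API):

>>> weekendSoup.div.get_text(strip=True)
'YES!'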
>>> for d in bodyTag.descendants: print(d)
...
<div class="answer text" id="answer" style="font-weight: bold; font-size: 120pt; font-family: Helvetica Neue, Helvetica, Swis721 BT, Arial, sans-serif; text-decoration: none; color: black;">
YES!
</div>
YES!
<div style="font-size: 5pt; font-family: Helvetica Neue, Helvetica, Swis721 BT, Arial, sans-serif; text-decoration: none; color: gray;">
<a href="http://theweekendhaslanded.org">The weekend has landed!</a>
</div>
<a href="http://theweekendhaslanded.org">The weekend has landed!</a>
The weekend has landed!
>>> weekendSoup.a.parent.name
u'div'
>>> for p in weekendSoup.a.parents: print(p.name)
...
div
body
html
[document]
>>> weekendSoup.div.next_sibling
u'\n'
>>> weekendSoup.div.next_sibling.next_sibling
<div style="font-size: 5pt; font-family: Helvetica Neue, Helvetica, Swis721 BT, Arial, sans-serif; text-decoration: none; color: gray;">
<a href="http://theweekendhaslanded.org">The weekend has landed!</a>
</div>
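The whitespace between tags shows up as text-node siblings, which is why next_sibling is u'\n' first. find_next_sibling() skips straight to the next matching tag (standard bs4 API):

>>> weekendSoup.div.find_next_sibling('div')
<div style="font-size: 5pt; ...">
<a href="http://theweekendhaslanded.org">The weekend has landed!</a>
</div>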
Write a script:
Use Beautiful Soup to navigate to the answer to our question:
Is it weekend yet?
'''A simple script that tells us if it's weekend yet'''
# import modules
from urllib.request import urlopen
from bs4 import BeautifulSoup
# open webpage
# parse HTML into Beautiful Soup
# extract data from parsed soup
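One way to fill in the skeleton (a minimal sketch; a fuller worked solution appears at the end of these notes):

'''A simple script that tells us if it's weekend yet'''
# import modules
from urllib.request import urlopen
from bs4 import BeautifulSoup
# open webpage
url = 'http://isitweekendyet.com/'
pageSource = urlopen(url).read()
# parse HTML into Beautiful Soup
weekendSoup = BeautifulSoup(pageSource, 'lxml')
# extract data from parsed soup
print('Is it weekend yet?', weekendSoup.div.string.strip())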
Using a filter in a search function
to zoom in on a part of the soup.
The simplest filter is a plain string:
>>> weekendSoup.find_all('div')
[<div style="font-weight: bold; font-size: 120pt; font-family: Helvetica Neue, Helvetica, Swis721 BT, Arial, sans-serif; text-decoration: none; color: black;">
NO
</div>, <div style="font-size: 5pt; font-family: Helvetica Neue, Helvetica, Swis721 BT, Arial, sans-serif; text-decoration: none; color: gray;">
don't worry, you'll get there!
</div>]
Besides using a string as the argument to a search function, you can also use:
Regular Expression
List
True
Function
more details: http://www.crummy.com/software/BeautifulSoup/bs4/doc/#kinds-of-filters
>>> import re
>>> for tag in weekendSoup.find_all(re.compile("^b")):
... print(tag.name)
...
body
>>> weekendSoup.find_all(["a", "li"])
[<a href="http://theweekendhaslanded.org">The weekend has landed!</a>]
>>> for tag in weekendSoup.find_all(True):
... print(tag.name)
...
html
head
title
meta
body
div
div
a
>>> def has_class_but_no_id(tag):
...     return tag.has_attr('class') and not tag.has_attr('id')
...
>>> weekendSoup.find_all(has_class_but_no_id)
[]
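(The empty list is correct here: on isitweekendyet.com the only tag with a class is the answer div, and it also has an id, so nothing matches.)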
Be careful to use "class_" when filtering on a class name ("class" by itself is a reserved word in Python):
import re
urlGA = 'https://gallery.generalassemb.ly/'
pageSourceGA = urlopen(urlGA).read()
GASoup = BeautifulSoup(pageSourceGA, 'lxml')
wdiLinks = GASoup.find_all('a', href=re.compile('WD'))  # regex match; a plain href='WD' would only match exactly "WD"
projects = GASoup.find_all('li', class_='project')
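An equivalent spelling that avoids the class_ workaround is to pass an attrs dict (standard bs4 API):

projects = GASoup.find_all('li', attrs={'class': 'project'})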
>>> soup.find_all('title', limit=1)
[<title> Monty Python's reunion is about nostalgia and heroes, not comedy | Stage | theguardian.com </title>]
>>> soup.find('title')
<title> Monty Python's reunion is about nostalgia and heroes, not comedy | Stage | theguardian.com </title>
How many projects on (the first page of) the GA Gallery are from San Francisco?
pro tip: use a search function
https://gallery.generalassemb.ly/
Bonus:
what are all the unique locations that projects have come from?
soup.select("#content")
soup.select("div#content")
soup.select(".byline")
soup.select("li.byline")
soup.select("#content a")
soup.select("#content > a")
soup.select('a[href]')
soup.select('a[href="http://www.theguardian.com/profile/brianlogan"]')
soup.select('a[href^="http://www.theguardian.com/"]')
soup.select('a[href$="info"]')
[<a class="link-text" href="http://www.theguardian.com/info">About us,</a>, <a class="link-text" href="http://www.theguardian.com/info">About us</a>]
>>> guardianSoup.select('a[href*=".com/contact"]')
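select() always returns a list; recent versions of Beautiful Soup also provide select_one(), which returns just the first match (or None):

>>> soup.select_one('title')
<title> Monty Python's reunion is about nostalgia and heroes, not comedy | Stage | theguardian.com </title>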
[<a class="rollover contact-link" href="http://www.theguardian.com/contactus/2120188" title="Displays contact data for guardian.co.uk"><img alt="" class="trail-icon" src="http://static.guim.co.uk/static/ac46d0fc9b2bab67a9a8a8dd51cd8efdbc836fbf/common/images/icon-email-us.png"/><span>Contact us</span></a>]
We generally want to:
clean up
calculate
process
>>> answer = soup.div.string
>>> answer
'\nNO\n'
>>> cleaned = answer.strip()
>>> cleaned
'NO'
>>> isWeekendYet = cleaned == 'YES'
>>> isWeekendYet
False
# print info to screen
print('Is it weekend yet? ', isWeekendYet)
import csv
with open('weekend.csv', 'w', newline='') as csvfile:
    weekendWriter = csv.writer(csvfile)
    if isWeekendYet:
        weekendWriter.writerow(['Yes'])
    else:
        weekendWriter.writerow(['No'])
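Reading the file back is symmetric, handy for checking the output (a minimal sketch with the same csv module):

import csv
with open('weekend.csv', newline='') as csvfile:
    for row in csv.reader(csvfile):
        print(row)  # e.g. ['No']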
'''A simple script that ... '''
# import modules
from urllib.request import urlopen
from bs4 import BeautifulSoup
# open webpage
url =
pageSource =
# parse HTML into Beautiful Soup
BitSoup =
# extract data from soup
# clean up data
# process data
# action based on data
google: "python" + your problem / question
python.org/doc/: the official Python documentation; useful for finding which functions are available
stackoverflow.com: a huge gamified help forum covering all sorts of programming questions; answers are ranked by the community
codecademy.com/tracks/python: interactive exercises that teach you coding by doing
wiki.python.org/moin/BeginnersGuide/Programmers: tools, lessons and tutorials
Python Usage Survey 2014 visualised
http://www.randalolson.com/2015/01/30/python-usage-survey-2014/
Python 2 & 3 Key Differences
'''
Bitcoin converter that tells us how much our bitcoins are worth in GBP
NB: expects python3
'''
# import modules
from urllib.request import urlopen
from bs4 import BeautifulSoup
def clean_up_rate(rateString):
    '''Clean up the raw rateString to form a float rate'''
    rateNumber = rateString[1:10]  # drop the leading currency symbol (assumes the page's format)
    return float(rateNumber)

def main():
    '''Our main function that gets called when we run the program'''
    # open webpage
    url = "http://bitcoinexchangerate.org/c/GBP/1"
    webpage = urlopen(url).read()
    # turn html into beautiful soup
    bitcoinSoup = BeautifulSoup(webpage, 'lxml')
    # extract info from soup
    rateString = bitcoinSoup.find('b').string.strip()
    # clean up data
    rate = clean_up_rate(rateString)
    # get user input
    bitcoinsString = input("How many bitcoins have you got?\n>>> ")
    bitcoins = float(bitcoinsString)
    # print output
    GBP = rate * bitcoins
    print("You have", round(GBP, 2), "GBP!")

# this kicks off our program & lets us both run and import the program
if __name__ == '__main__':
    main()
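A run might look like this (the rate, and hence the answer, is hypothetical):

How many bitcoins have you got?
>>> 2
You have 613.5 GBP!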
remember: readability counts!

>>> from urllib.request import urlopen
>>> from bs4 import BeautifulSoup
>>> url = "http://isitweekendyet.com/"
>>> source = urlopen(url).read()
>>> soup = BeautifulSoup(source, 'lxml')
>>> soup.body.div.string
'\nNO\n'
# an alternative:
>>> list(soup.body.stripped_strings)[0]
'NO'
many routes possible...

GAurl = "https://gallery.generalassemb.ly/"
GAsource = urlopen(GAurl).read()
GAsoup = BeautifulSoup(GAsource, 'lxml')
def countLondon():
    londonProjects = GAsoup.find_all('a', href='/?metro=london')
    londonCount = len(londonProjects)
    return londonCount

def getUniqueLocations():
    metros = GAsoup.find_all('a', class_='metro')
    uniqueLocations = []
    for metro in metros:
        location = metro.string
        if location not in uniqueLocations:
            uniqueLocations.append(location)
    return uniqueLocations

def getLocationCounts():
    metros = GAsoup.find_all('a', class_='metro')
    locationCounts = {}
    for metro in metros:
        location = metro.string
        if location not in locationCounts:
            locationCounts[location] = 1
        else:
            locationCounts[location] += 1
    return locationCounts
# pro tip: pretty print the result
from pprint import pprint
locationCounts = getLocationCounts()
pprint(locationCounts)
By J Rogel