Building a Trump/Obama Tweet Classifier with 98% accuracy
In 1 hour!
bene@theodo.co.uk
Ben Ellerby
Data
MODEL
VALIDATE
Chapter 1
Data
Data
MODEL
VALIDATE
Where can I find labelled data?
When tackling an ML classification project, you need lots* of labelled training data.
*
By Phoenix7777 [CC BY-SA 4.0 (https://creativecommons.org/licenses/by-sa/4.0)], via Wikimedia Commons
By Phoenix7777 [CC BY-SA 4.0 (https://creativecommons.org/licenses/by-sa/4.0)], via Wikimedia Commons
By Phoenix7777 [CC BY-SA 4.0 (https://creativecommons.org/licenses/by-sa/4.0)], via Wikimedia Commons
Congress declare Obama's second term
@BarackObama
15.5K Tweets
1 every 8 hours
Joined: March 2007
@realDonaldTrump
36.4K Tweets
1 every 2 hours!
Joined: March 2009
Tweepy
An easy-to-use Python library for accessing the Twitter API.
Look for an existing client library for the API you need rather than calling the API directly from your script.
Scrape
Scrape
#!/usr/bin/env python
# encoding: utf-8
import tweepy #https://github.com/tweepy/tweepy
import json
#Twitter API credentials
# Placeholder strings: replace each one with the matching credential of your
# own Twitter application before running the script. get_all_tweets() reads
# these module-level names when it authenticates, so keep real values out of
# version control.
consumer_key = "Consumer key goes here"
consumer_secret = "Consumer secret goes here"
access_key = "access key goes here"
access_secret = "access secret goes here"
def get_all_tweets(screen_name):
    """Download a user's timeline and dump every tweet to ``tweet.json``.

    Twitter only allows access to a user's most recent 3240 tweets with this
    method. Tweets are fetched in pages of up to 200, walking backwards in
    time with ``max_id`` until an empty page is returned.

    The tweets are written as pretty-printed JSON objects placed back to
    back with no separator between them, so the resulting file is a
    concatenation of objects rather than a single valid JSON document; it
    has to be cleaned before it can be parsed in one go.

    Args:
        screen_name: Twitter handle of the account to download.
    """
    # Authorize with Twitter and initialize tweepy.
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    # Holds every tweepy Status object fetched so far, newest first.
    alltweets = []

    # Initial request for the most recent tweets (200 is the maximum allowed count).
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)
    alltweets.extend(new_tweets)

    # Keep grabbing older pages until a request comes back empty. Testing the
    # page before touching alltweets[-1] keeps an empty timeline from raising
    # IndexError.
    while new_tweets:
        # Id of the oldest tweet seen, less one, so the next page has no duplicates.
        oldest = alltweets[-1].id - 1
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)
        alltweets.extend(new_tweets)
        print("...%s tweets downloaded so far" % (len(alltweets)))

    # Text mode: json.dump writes str, which a binary-mode file rejects on
    # Python 3. The context manager closes the file even if a dump fails.
    with open('tweet.json', 'w') as out_file:
        print("Writing tweet objects to JSON please wait...")
        for status in alltweets:
            json.dump(status._json, out_file, sort_keys=True, indent=4)
        print("Done")
if __name__ == '__main__':
    # Pass in the username of the account you want to download.
    get_all_tweets("@realDonaldTrump")
tweet.py
Open with Google Docs
Displaying tweet.py.
https://drive.google.com/file/d/0Bw1LIIbSl0xuNnJ0N1ppSkRjQjQ/view
Scrape
#!/usr/bin/env python
# encoding: utf-8
import tweepy #https://github.com/tweepy/tweepy
import json
#Twitter API credentials
# The x's are presumably redacted values (confirm with the author); substitute
# the credentials of your own Twitter application before running.
# get_all_tweets() reads these module-level names when it authenticates, so
# keep real values out of version control.
consumer_key = "xxxxxxxxxx"
consumer_secret = "xxxxxxxx"
access_key = "xxxxxxxxx"
access_secret = "xxxxxxxxxxx"
def get_all_tweets(screen_name):
    """Download a user's timeline and dump every tweet to ``tweet2.json``.

    Twitter only allows access to a user's most recent 3240 tweets with this
    method. Tweets are fetched in pages of up to 200, walking backwards in
    time with ``max_id`` until an empty page is returned.

    The tweets are written as pretty-printed JSON objects placed back to
    back with no separator between them, so the resulting file is a
    concatenation of objects rather than a single valid JSON document; it
    has to be cleaned before it can be parsed in one go.

    Args:
        screen_name: Twitter handle of the account to download.
    """
    # Authorize with Twitter and initialize tweepy.
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    # Holds every tweepy Status object fetched so far, newest first.
    alltweets = []

    # Initial request for the most recent tweets (200 is the maximum allowed count).
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)
    alltweets.extend(new_tweets)

    # Keep grabbing older pages until a request comes back empty. Testing the
    # page before touching alltweets[-1] keeps an empty timeline from raising
    # IndexError.
    while new_tweets:
        # Id of the oldest tweet seen, less one, so the next page has no duplicates.
        oldest = alltweets[-1].id - 1
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest)
        alltweets.extend(new_tweets)
        print("...%s tweets downloaded so far" % (len(alltweets)))

    # Text mode: json.dump writes str, which a binary-mode file rejects on
    # Python 3. The context manager closes the file even if a dump fails.
    with open('tweet2.json', 'w') as out_file:
        print("Writing tweet objects to JSON please wait...")
        for status in alltweets:
            json.dump(status._json, out_file, sort_keys=True, indent=4)
        print("Done")
if __name__ == '__main__':
    # Pass in the username of the account you want to download.
    get_all_tweets("@barackobama")
Scrape
{
"contributors": null,
"coordinates": null,
"created_at": "Sun Jan 22 18:07:02 +0000 2017",
"entities": {
"hashtags": [],
"symbols": [],
"urls": [
{
"display_url": "nyti.ms/2jAhB2P",
"expanded_url": "https://nyti.ms/2jAhB2P",
"indices": [
104,
127
],
"url": "https://t.co/so1luBcszV"
}
],
"user_mentions": []
},
"favorite_count": 212300,
"favorited": false,
"geo": null,
"id": 823230505546117120,
"id_str": "823230505546117120",
"in_reply_to_screen_name": null,
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"is_quote_status": false,
"lang": "en",
"place": null,
"possibly_sensitive": false,
"retweet_count": 31854,
"retweeted": false,
"source": "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>",
"text": "I read letters like these every single day. It was one of the best parts of the job \u2013 hearing from you. https://t.co/so1luBcszV",
"truncated": false,
"user": {
"contributors_enabled": false,
"created_at": "Mon Mar 05 22:08:25 +0000 2007",
"default_profile": false,
"default_profile_image": false,
"description": "Dad, husband, President, citizen.",
"entities": {
"description": {
"urls": []
},
"url": {
"urls": [
{
"display_url": "obama.org",
"expanded_url": "https://www.obama.org/",
"indices": [
0,
23
],
"url": "https://t.co/93Y27HEnnX"
}
]
}
},
"favourites_count": 10,
"follow_request_sent": false,
"followers_count": 84156680,
"following": false,
"friends_count": 631657,
"geo_enabled": false,
"has_extended_profile": true,
"id": 813286,
"id_str": "813286",
"is_translation_enabled": true,
"is_translator": false,
"lang": "en",
"listed_count": 221017,
"location": "Washington, DC",
"name": "Barack Obama",
"notifications": false,
"profile_background_color": "77B0DC",
"profile_background_image_url": "http://pbs.twimg.com/profile_background_images/451819093436268544/kLbRvwBg.png",
"profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/451819093436268544/kLbRvwBg.png",
"profile_background_tile": false,
"profile_banner_url": "https://pbs.twimg.com/profile_banners/813286/1484945688",
"profile_image_url": "http://pbs.twimg.com/profile_images/822547732376207360/5g0FC8XX_normal.jpg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/822547732376207360/5g0FC8XX_normal.jpg",
"profile_link_color": "2574AD",
"profile_sidebar_border_color": "FFFFFF",
"profile_sidebar_fill_color": "C2E0F6",
"profile_text_color": "333333",
"profile_use_background_image": true,
"protected": false,
"screen_name": "BarackObama",
"statuses_count": 15434,
"time_zone": "Eastern Time (US & Canada)",
"translator_type": "regular",
"url": "https://t.co/93Y27HEnnX",
"utc_offset": -18000,
"verified": true
}
}{
"contributors": null,
"coordinates": null,
"created_at": "Fri Jan 20 21:17:01 +0000 2017",
"entities": {
"hashtags": [],
"media": [
{
"display_url": "pic.twitter.com/Uf7oEvkZF3",
"expanded_url": "https://twitter.com/ObamaFoundation/status/822537741195997186/video/1",
"id": 822536684919279619,
"id_str": "822536684919279619",
"indices": [
61,
84
],
"media_url": "http://pbs.twimg.com/ext_tw_video_thumb/822536684919279619/pu/img/Oix1rNF0c1epLgOn.jpg",
"media_url_https": "https://pbs.twimg.com/ext_tw_video_thumb/822536684919279619/pu/img/Oix1rNF0c1epLgOn.jpg",
"sizes": {
"large": {
"h": 576,
"resize": "fit",
"w": 1024
},
"medium": {
"h": 338,
"resize": "fit",
"w": 600
},
"small": {
"h": 191,
"resize": "fit",
"w": 340
},
"thumb": {
"h": 150,
"resize": "crop",
"w": 150
}
},
"source_status_id": 822537741195997186,
"source_status_id_str": "822537741195997186",
"source_user_id": 2293315159,
"source_user_id_str": "2293315159",
"type": "photo",
"url": "https://t.co/Uf7oEvkZF3"
}
],
"symbols": [],
"urls": [
{
"display_url": "Obama.org",
"expanded_url": "http://Obama.org",
"indices": [
37,
60
],
"url": "https://t.co/mA9MSHmi7o"
}
],
"user_mentions": [
{
"id": 2293315159,
"id_str": "2293315159",
"indices": [
3,
19
],
"name": "The Obama Foundation",
"screen_name": "ObamaFoundation"
}
]
},
"extended_entities": {
"media": [
{
"additional_media_info": {
"description": "",
"embeddable": true,
"monetizable": false,
"source_user": {
"contributors_enabled": false,
"created_at": "Wed Jan 15 20:02:36 +0000 2014",
"default_profile": false,
"default_profile_image": false,
"description": "The Obama Foundation is a living, working start-up for citizenship \u2014 an ongoing project for us to shape, together, what it means to be a good citizen.",
"entities": {
"description": {
"urls": []
},
"url": {
"urls": [
{
"display_url": "obama.org",
"expanded_url": "http://obama.org",
"indices": [
0,
23
],
"url": "https://t.co/0UVvR5L6vm"
}
]
}
},
"favourites_count": 65,
"follow_request_sent": false,
"followers_count": 322426,
"following": false,
"friends_count": 85,
"geo_enabled": false,
"has_extended_profile": false,
"id": 2293315159,
"id_str": "2293315159",
"is_translation_enabled": false,
"is_translator": false,
"lang": "en",
"listed_count": 1409,
"location": "",
"name": "The Obama Foundation",
"notifications": false,
"profile_background_color": "EEEEEE",
"profile_background_image_url": "http://pbs.twimg.com/profile_background_images/446716814441988096/NfEVfyB4.png",
"profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/446716814441988096/NfEVfyB4.png",
"profile_background_tile": false,
"profile_banner_url": "https://pbs.twimg.com/profile_banners/2293315159/1422497174",
"profile_image_url": "http://pbs.twimg.com/profile_images/822253020012511233/C0HXLxod_normal.jpg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/822253020012511233/C0HXLxod_normal.jpg",
"profile_link_color": "505E73",
"profile_sidebar_border_color": "FFFFFF",
"profile_sidebar_fill_color": "DDEEF6",
"profile_text_color": "333333",
"profile_use_background_image": false,
"protected": false,
"screen_name": "ObamaFoundation",
"statuses_count": 455,
"time_zone": "Eastern Time (US & Canada)",
"translator_type": "none",
"url": "https://t.co/0UVvR5L6vm",
"utc_offset": -18000,
"verified": true
},
"title": ""
},
"display_url": "pic.twitter.com/Uf7oEvkZF3",
"expanded_url": "https://twitter.com/ObamaFoundation/status/822537741195997186/video/1",
"id": 822536684919279619,
"id_str": "822536684919279619",
"indices": [
61,
84
],
"media_url": "http://pbs.twimg.com/ext_tw_video_thumb/822536684919279619/pu/img/Oix1rNF0c1epLgOn.jpg",
"media_url_https": "https://pbs.twimg.com/ext_tw_video_thumb/822536684919279619/pu/img/Oix1rNF0c1epLgOn.jpg",
"sizes": {
"large": {
"h": 576,
"resize": "fit",
"w": 1024
},
"medium": {
"h": 338,
"resize": "fit",
"w": 600
},
"small": {
"h": 191,
"resize": "fit",
"w": 340
},
"thumb": {
"h": 150,
"resize": "crop",
"w": 150
}
},
"source_status_id": 822537741195997186,
"source_status_id_str": "822537741195997186",
"source_user_id": 2293315159,
"source_user_id_str": "2293315159",
"type": "video",
"url": "https://t.co/Uf7oEvkZF3",
"video_info": {
"aspect_ratio": [
16,
9
],
"duration_millis": 57057,
"variants": [
{
"bitrate": 832000,
"content_type": "video/mp4",
"url": "https://video.twimg.com/ext_tw_video/822536684919279619/pu/vid/640x360/maPMRQAxm9xmdHnz.mp4"
},
{
"content_type": "application/x-mpegURL",
"url": "https://video.twimg.com/ext_tw_video/822536684919279619/pu/pl/8ux8qYeUzJguKxaP.m3u8"
},
{
"content_type": "application/dash+xml",
"url": "https://video.twimg.com/ext_tw_video/822536684919279619/pu/pl/8ux8qYeUzJguKxaP.mpd"
},
{
"bitrate": 320000,
"content_type": "video/mp4",
"url": "https://video.twimg.com/ext_tw_video/822536684919279619/pu/vid/320x180/GpGseeSPr_17U7HH.mp4"
},
{
"bitrate": 2176000,
"content_type": "video/mp4",
"url": "https://video.twimg.com/ext_tw_video/822536684919279619/pu/vid/1280x720/hyOOosD_xZTWZxy9.mp4"
}
]
}
}
]
},
"favorite_count": 0,
"favorited": false,
"geo": null,
"id": 822553543479541761,
"id_str": "822553543479541761",
"in_reply_to_screen_name": null,
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"is_quote_status": false,
"lang": "en",
"place": null,
"possibly_sensitive": false,
"retweet_count": 44641,
"retweeted": false,
"retweeted_status": {
"contributors": null,
"coordinates": null,
"created_at": "Fri Jan 20 20:14:14 +0000 2017",
"entities": {
"hashtags": [],
"media": [
{
"display_url": "pic.twitter.com/Uf7oEvkZF3",
"expanded_url": "https://twitter.com/ObamaFoundation/status/822537741195997186/video/1",
"id": 822536684919279619,
"id_str": "822536684919279619",
"indices": [
40,
63
],
"media_url": "http://pbs.twimg.com/ext_tw_video_thumb/822536684919279619/pu/img/Oix1rNF0c1epLgOn.jpg",
"media_url_https": "https://pbs.twimg.com/ext_tw_video_thumb/822536684919279619/pu/img/Oix1rNF0c1epLgOn.jpg",
"sizes": {
"large": {
"h": 576,
"resize": "fit",
"w": 1024
},
"medium": {
"h": 338,
"resize": "fit",
"w": 600
},
"small": {
"h": 191,
"resize": "fit",
"w": 340
},
"thumb": {
"h": 150,
"resize": "crop",
"w": 150
}
},
"type": "photo",
"url": "https://t.co/Uf7oEvkZF3"
}
],
"symbols": [],
"urls": [
{
"display_url": "Obama.org",
"expanded_url": "http://Obama.org",
"indices": [
16,
39
],
"url": "https://t.co/mA9MSHmi7o"
}
],
"user_mentions": []
},
"extended_entities": {
"media": [
{
"additional_media_info": {
"description": "",
"embeddable": true,
"monetizable": false,
"title": ""
},
"display_url": "pic.twitter.com/Uf7oEvkZF3",
"expanded_url": "https://twitter.com/ObamaFoundation/status/822537741195997186/video/1",
"id": 822536684919279619,
"id_str": "822536684919279619",
"indices": [
40,
63
],
"media_url": "http://pbs.twimg.com/ext_tw_video_thumb/822536684919279619/pu/img/Oix1rNF0c1epLgOn.jpg",
"media_url_https": "https://pbs.twimg.com/ext_tw_video_thumb/822536684919279619/pu/img/Oix1rNF0c1epLgOn.jpg",
"sizes": {
"large": {
"h": 576,
"resize": "fit",
"w": 1024
},
"medium": {
"h": 338,
"resize": "fit",
"w": 600
},
"small": {
"h": 191,
"resize": "fit",
"w": 340
},
"thumb": {
"h": 150,
"resize": "crop",
"w": 150
}
},
"type": "video",
"url": "https://t.co/Uf7oEvkZF3",
"video_info": {
"aspect_ratio": [
16,
9
],
"duration_millis": 57057,
"variants": [
{
"bitrate": 832000,
"content_type": "video/mp4",
"url": "https://video.twimg.com/ext_tw_video/822536684919279619/pu/vid/640x360/maPMRQAxm9xmdHnz.mp4"
},
{
"content_type": "application/x-mpegURL",
"url": "https://video.twimg.com/ext_tw_video/822536684919279619/pu/pl/8ux8qYeUzJguKxaP.m3u8"
},
{
"content_type": "application/dash+xml",
"url": "https://video.twimg.com/ext_tw_video/822536684919279619/pu/pl/8ux8qYeUzJguKxaP.mpd"
},
{
"bitrate": 320000,
"content_type": "video/mp4",
"url": "https://video.twimg.com/ext_tw_video/822536684919279619/pu/vid/320x180/GpGseeSPr_17U7HH.mp4"
},
{
"bitrate": 2176000,
"content_type": "video/mp4",
"url": "https://video.twimg.com/ext_tw_video/822536684919279619/pu/vid/1280x720/hyOOosD_xZTWZxy9.mp4"
}
]
}
}
]
},
"favorite_count": 138777,
"favorited": false,
"geo": null,
"id": 822537741195997186,
"id_str": "822537741195997186",
"in_reply_to_screen_name": null,
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"is_quote_status": false,
"lang": "en",
"place": null,
"possibly_sensitive": false,
"retweet_count": 44641,
"retweeted": false,
"source": "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>",
"text": "Add your voice: https://t.co/mA9MSHmi7o https://t.co/Uf7oEvkZF3",
"truncated": false,
"user": {
"contributors_enabled": false,
"created_at": "Wed Jan 15 20:02:36 +0000 2014",
"default_profile": false,
"default_profile_image": false,
"description": "The Obama Foundation is a living, working start-up for citizenship \u2014 an ongoing project for us to shape, together, what it means to be a good citizen.",
"entities": {
"description": {
"urls": []
},
"url": {
"urls": [
{
"display_url": "obama.org",
"expanded_url": "http://obama.org",
"indices": [
0,
23
],
"url": "https://t.co/0UVvR5L6vm"
}
]
}
},
"favourites_count": 65,
"follow_request_sent": false,
"followers_count": 322426,
"following": false,
"friends_count": 85,
"geo_enabled": false,
"has_extended_profile": false,
"id": 2293315159,
"id_str": "2293315159",
"is_translation_enabled": false,
"is_translator": false,
"lang": "en",
"listed_count": 1409,
"location": "",
"name": "The Obama Foundation",
"notifications": false,
"profile_background_color": "EEEEEE",
"profile_background_image_url": "http://pbs.twimg.com/profile_background_images/446716814441988096/NfEVfyB4.png",
"profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/446716814441988096/NfEVfyB4.png",
"profile_background_tile": false,
"profile_banner_url": "https://pbs.twimg.com/profile_banners/2293315159/1422497174",
"profile_image_url": "http://pbs.twimg.com/profile_images/822253020012511233/C0HXLxod_normal.jpg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/822253020012511233/C0HXLxod_normal.jpg",
"profile_link_color": "505E73",
"profile_sidebar_border_color": "FFFFFF",
"profile_sidebar_fill_color": "DDEEF6",
"profile_text_color": "333333",
"profile_use_background_image": false,
"protected": false,
"screen_name": "ObamaFoundation",
"statuses_count": 455,
"time_zone": "Eastern Time (US & Canada)",
"translator_type": "none",
"url": "https://t.co/0UVvR5L6vm",
"utc_offset": -18000,
"verified": true
}
},
"source": "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</a>",
"text": "RT @ObamaFoundation: Add your voice: https://t.co/mA9MSHmi7o https://t.co/Uf7oEvkZF3",
"truncated": false,
"user": {
"contributors_enabled": false,
"created_at": "Mon Mar 05 22:08:25 +0000 2007",
"default_profile": false,
"default_profile_image": false,
"description": "Dad, husband, President, citizen.",
"entities": {
"description": {
"urls": []
},
"url": {
"urls": [
{
"display_url": "obama.org",
"expanded_url": "https://www.obama.org/",
"indices": [
0,
23
],
"url": "https://t.co/93Y27HEnnX"
}
]
}
},
"favourites_count": 10,
"follow_request_sent": false,
"followers_count": 84156680,
"following": false,
"friends_count": 631657,
"geo_enabled": false,
"has_extended_profile": true,
"id": 813286,
"id_str": "813286",
"is_translation_enabled": true,
"is_translator": false,
"lang": "en",
"listed_count": 221017,
"location": "Washington, DC",
"name": "Barack Obama",
"notifications": false,
"profile_background_color": "77B0DC",
"profile_background_image_url": "http://pbs.twimg.com/profile_background_images/451819093436268544/kLbRvwBg.png",
"profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/451819093436268544/kLbRvwBg.png",
"profile_background_tile": false,
"profile_banner_url": "https://pbs.twimg.com/profile_banners/813286/1484945688",
"profile_image_url": "http://pbs.twimg.com/profile_images/822547732376207360/5g0FC8XX_normal.jpg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/822547732376207360/5g0FC8XX_normal.jpg",
"profile_link_color": "2574AD",
"profile_sidebar_border_color": "FFFFFF",
"profile_sidebar_fill_color": "C2E0F6",
"profile_text_color": "333333",
"profile_use_background_image": true,
"protected": false,
"screen_name": "BarackObama",
"statuses_count": 15434,
"time_zone": "Eastern Time (US & Canada)",
"translator_type": "regular",
"url": "https://t.co/93Y27HEnnX",
"utc_offset": -18000,
"verified": true
}
}{
Scrape
{
"contributors": null,
"coordinates": null,
"created_at": "Sun Feb 05 00:48:12 +0000 2017",
"entities": {
"hashtags": [],
"symbols": [],
"urls": [],
"user_mentions": []
},
"favorite_count": 76237,
"favorited": false,
"geo": null,
"id": 828042506851934209,
"id_str": "828042506851934209",
"in_reply_to_screen_name": null,
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"is_quote_status": false,
"lang": "en",
"place": null,
"retweet_count": 17572,
"retweeted": false,
"source": "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>",
"text": "The judge opens up our country to potential terrorists and others that do not have our best interests at heart. Bad people are very happy!",
"truncated": false,
"user": {
"contributors_enabled": false,
"created_at": "Wed Mar 18 13:46:38 +0000 2009",
"default_profile": false,
"default_profile_image": false,
"description": "45th President of the United States of America",
"entities": {
"description": {
"urls": []
}
},
"favourites_count": 45,
"follow_request_sent": false,
"followers_count": 23793367,
"following": false,
"friends_count": 41,
"geo_enabled": true,
"has_extended_profile": false,
"id": 25073877,
"id_str": "25073877",
"is_translation_enabled": true,
"is_translator": false,
"lang": "en",
"listed_count": 61418,
"location": "Washington, DC",
"name": "Donald J. Trump",
"notifications": false,
"profile_background_color": "6D5C18",
"profile_background_image_url": "http://pbs.twimg.com/profile_background_images/530021613/trump_scotland__43_of_70_cc.jpg",
"profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/530021613/trump_scotland__43_of_70_cc.jpg",
"profile_background_tile": true,
"profile_banner_url": "https://pbs.twimg.com/profile_banners/25073877/1485301108",
"profile_image_url": "http://pbs.twimg.com/profile_images/1980294624/DJT_Headshot_V2_normal.jpg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/1980294624/DJT_Headshot_V2_normal.jpg",
"profile_link_color": "0D5B73",
"profile_sidebar_border_color": "BDDCAD",
"profile_sidebar_fill_color": "C5CEC0",
"profile_text_color": "333333",
"profile_use_background_image": true,
"protected": false,
"screen_name": "realDonaldTrump",
"statuses_count": 34437,
"time_zone": "Eastern Time (US & Canada)",
"translator_type": "regular",
"url": null,
"utc_offset": -18000,
"verified": true
}
}{
"contributors": null,
"coordinates": null,
"created_at": "Sun Feb 05 00:34:50 +0000 2017",
"entities": {
"hashtags": [],
"symbols": [],
"urls": [],
"user_mentions": [
{
"id": 23970102,
"id_str": "23970102",
"indices": [
15,
29
],
"name": "Bill O'Reilly",
"screen_name": "oreillyfactor"
}
]
},
"favorite_count": 44351,
"favorited": false,
"geo": null,
"id": 828039143318024194,
"id_str": "828039143318024194",
"in_reply_to_screen_name": null,
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"is_quote_status": false,
"lang": "en",
"place": null,
"retweet_count": 7915,
"retweeted": false,
"source": "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>",
"text": "Interview with @oreillyfactor on Fox Network - 4:00 P.M. (prior to Super Bowl). Enjoy!",
"truncated": false,
"user": {
"contributors_enabled": false,
"created_at": "Wed Mar 18 13:46:38 +0000 2009",
"default_profile": false,
"default_profile_image": false,
"description": "45th President of the United States of America",
"entities": {
"description": {
"urls": []
}
},
"favourites_count": 45,
"follow_request_sent": false,
"followers_count": 23793367,
"following": false,
"friends_count": 41,
"geo_enabled": true,
"has_extended_profile": false,
"id": 25073877,
"id_str": "25073877",
"is_translation_enabled": true,
"is_translator": false,
"lang": "en",
"listed_count": 61418,
"location": "Washington, DC",
"name": "Donald J. Trump",
"notifications": false,
"profile_background_color": "6D5C18",
"profile_background_image_url": "http://pbs.twimg.com/profile_background_images/530021613/trump_scotland__43_of_70_cc.jpg",
"profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/530021613/trump_scotland__43_of_70_cc.jpg",
"profile_background_tile": true,
"profile_banner_url": "https://pbs.twimg.com/profile_banners/25073877/1485301108",
"profile_image_url": "http://pbs.twimg.com/profile_images/1980294624/DJT_Headshot_V2_normal.jpg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/1980294624/DJT_Headshot_V2_normal.jpg",
"profile_link_color": "0D5B73",
"profile_sidebar_border_color": "BDDCAD",
"profile_sidebar_fill_color": "C5CEC0",
"profile_text_color": "333333",
"profile_use_background_image": true,
"protected": false,
"screen_name": "realDonaldTrump",
"statuses_count": 34437,
"time_zone": "Eastern Time (US & Canada)",
"translator_type": "regular",
"url": null,
"utc_offset": -18000,
"verified": true
}
}{
"contributors": null,
"coordinates": null,
"created_at": "Sat Feb 04 23:37:59 +0000 2017",
"entities": {
"hashtags": [],
"symbols": [],
"urls": [],
"user_mentions": []
},
"favorite_count": 64543,
"favorited": false,
"geo": null,
"id": 828024835670413312,
"id_str": "828024835670413312",
"in_reply_to_screen_name": null,
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"is_quote_status": false,
"lang": "en",
"place": null,
"retweet_count": 13015,
"retweeted": false,
"source": "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>",
"text": "Why aren't the lawyers looking at and using the Federal Court decision in Boston, which is at conflict with ridiculous lift ban decision?",
"truncated": false,
"user": {
"contributors_enabled": false,
"created_at": "Wed Mar 18 13:46:38 +0000 2009",
"default_profile": false,
"default_profile_image": false,
"description": "45th President of the United States of America",
"entities": {
"description": {
"urls": []
}
},
"favourites_count": 45,
"follow_request_sent": false,
"followers_count": 23793367,
"following": false,
"friends_count": 41,
"geo_enabled": true,
"has_extended_profile": false,
"id": 25073877,
"id_str": "25073877",
"is_translation_enabled": true,
"is_translator": false,
"lang": "en",
"listed_count": 61418,
"location": "Washington, DC",
"name": "Donald J. Trump",
"notifications": false,
"profile_background_color": "6D5C18",
"profile_background_image_url": "http://pbs.twimg.com/profile_background_images/530021613/trump_scotland__43_of_70_cc.jpg",
"profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/530021613/trump_scotland__43_of_70_cc.jpg",
"profile_background_tile": true,
"profile_banner_url": "https://pbs.twimg.com/profile_banners/25073877/1485301108",
"profile_image_url": "http://pbs.twimg.com/profile_images/1980294624/DJT_Headshot_V2_normal.jpg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/1980294624/DJT_Headshot_V2_normal.jpg",
"profile_link_color": "0D5B73",
"profile_sidebar_border_color": "BDDCAD",
"profile_sidebar_fill_color": "C5CEC0",
"profile_text_color": "333333",
"profile_use_background_image": true,
"protected": false,
"screen_name": "realDonaldTrump",
"statuses_count": 34437,
"time_zone": "Eastern Time (US & Canada)",
"translator_type": "regular",
"url": null,
"utc_offset": -18000,
"verified": true
}
}{
"contributors": null,
"coordinates": null,
"created_at": "Sat Feb 04 21:44:49 +0000 2017",
"entities": {
"hashtags": [],
"symbols": [],
"urls": [],
"user_mentions": []
},
"favorite_count": 99941,
"favorited": false,
"geo": null,
"id": 827996357252243456,
"id_str": "827996357252243456",
"in_reply_to_screen_name": null,
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"is_quote_status": false,
"lang": "en",
"place": null,
"retweet_count": 23019,
"retweeted": false,
"source": "<a href=\"http://twitter.com/download/android\" rel=\"nofollow\">Twitter for Android</a>",
"text": "Because the ban was lifted by a judge, many very bad and dangerous people may be pouring into our country. A terrible decision",
"truncated": false,
"user": {
"contributors_enabled": false,
"created_at": "Wed Mar 18 13:46:38 +0000 2009",
"default_profile": false,
"default_profile_image": false,
"description": "45th President of the United States of America",
"entities": {
"description": {
"urls": []
}
},
"favourites_count": 45,
"follow_request_sent": false,
"followers_count": 23793367,
"following": false,
"friends_count": 41,
"geo_enabled": true,
"has_extended_profile": false,
"id": 25073877,
"id_str": "25073877",
"is_translation_enabled": true,
"is_translator": false,
"lang": "en",
"listed_count": 61418,
"location": "Washington, DC",
"name": "Donald J. Trump",
"notifications": false,
"profile_background_color": "6D5C18",
"profile_background_image_url": "http://pbs.twimg.com/profile_background_images/530021613/trump_scotland__43_of_70_cc.jpg",
"profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/530021613/trump_scotland__43_of_70_cc.jpg",
"profile_background_tile": true,
"profile_banner_url": "https://pbs.twimg.com/profile_banners/25073877/1485301108",
"profile_image_url": "http://pbs.twimg.com/profile_images/1980294624/DJT_Headshot_V2_normal.jpg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/1980294624/DJT_Headshot_V2_normal.jpg",
"profile_link_color": "0D5B73",
"profile_sidebar_border_color": "BDDCAD",
"profile_sidebar_fill_color": "C5CEC0",
"profile_text_color": "333333",
"profile_use_background_image": true,
"protected": false,
"screen_name": "realDonaldTrump",
"statuses_count": 34437,
"time_zone": "Eastern Time (US & Canada)",
"translator_type": "regular",
"url": null,
"utc_offset": -18000,
"verified": true
}
}{
Cleaning
}{
Not valid JSON
Not even valid jsonlines
},{
Write a sed, regex or AWK script to clean your data into the format preferred by your language; don't write custom parsing logic.
Chapter 2
Model
Data
MODEL
VALIDATE
Time for some of that cutting-edge ML/AI...
1763
"The decomposition of large probabilistic domains into weakly connected subsets via conditional independence is one of the most important developments in the recent history of AI."
Russell & Norvig
Artificial Intelligence
i
BAYES
NPM package for building a text based Bayesian Classifier.
https://www.npmjs.com/package/bayes
Train
var bayes = require('bayes')
var classifier = bayes()

// Historic data as [text, category] pairs, taught to the classifier in order
var examples = [
  ['amazing, awesome movie!! 👍', 'positive'],
  ['this sucks, so rubbish! 💩', 'negative']
]
examples.forEach(function (example) {
  classifier.learn(example[0], example[1])
})
😃
var bayes = require('bayes')
var classifier = bayes()

// Train on historic data: learn(text, category).
// Each tweet is built from single-line string pieces, because a quoted
// JavaScript string literal cannot span several source lines.

// Trump
classifier.learn(
  'If Obama resigns from office NOW, ' +
  'thereby doing a great service to the country' +
  '—I will give him free lifetime golf at any one of my courses!',
  '@realDonaldTrump'
)

// Obama
// The last piece is double-quoted so the apostrophe in "That's" does not
// terminate the string.
classifier.learn(
  'Happy Back to the Future Day, @RealMikeFox! ' +
  'Ever think about the fact that we live in the future ' +
  "we dreamed of then? That's heavy, man.",
  '@POTUS44'
)
Choosing a Feature set
Attributes/measures of each data element that are used in classification e.g. length, words
- Word length
- Term frequency (TFIDF)
- Hash tags
A first step of applying ML classification is to choose a subset of the features.
How often a term is used.
TERM FREQUENCY
- Tokenized - split up into individual words without punctuation
- Frequency Table - mapping each token to the number of times it’s used.
var defaultTokenizer = function (text) {
  // Matches every character that is NOT a Latin or Cyrillic letter, a digit,
  // an underscore, whitespace, or one of the literal characters ( ) +
  // (inside a character class those three are ordinary characters).
  var rgxPunctuation = /[^(a-zA-ZA-Яa-я0-9_)+\s]/g

  // Blank out the matched characters, then cut on runs of whitespace.
  // Leading or trailing whitespace yields an empty-string token at that end.
  return text.replace(rgxPunctuation, ' ').split(/\s+/)
}
From: https://github.com/ttezel/bayes/blob/master/lib/naive_bayes.js
More advanced NLP
- TFIDF
- Stemming
- Stop word removal
- Lemmatization
-
GloVe:
- Global Vectors for Word Representation
UNDER THE HOOD
- Multinomial Model, not a Multi-variate Bernoulli Model, approach.
- "The multinomial model is found to be almost uniformly better than the multi-variate Bernoulli model" [1]
- Bag-of-words model, i.e. ignoring order and grammar, but keeping multiplicity.
- Words are the events, and documents are collections of words.
- Laplacean prior used, starting each word's prior at 1.
[1] Andrew McCallum and Kamal Nigam. 1998. A comparison of event models for Naive Bayes text classification. In Proc. of the AAAI-98 Workshop on Learning for Text Categorization, pages 41--48.
// Train one classifier on both presidents' historic tweets, then label a
// few unseen sentences.
var bayes = require('bayes');
var trumpTweets = require('./tweetFormatted.json');
var obamaTweets = require('./tweetFormatted2.json');
const data = [{name: 'obama', tweets: obamaTweets}, {name: 'trump', tweets: trumpTweets}];

// Train the model: every tweet's text is learned under its author's name
var classifier = bayes();
data.forEach(function (president) {
  console.log(`training model with historical ${president.name} data.`)
  president.tweets.forEach(function (tweet) {
    classifier.learn(tweet.text, president.name);
  });
});

// Classify some new data
var unseenSentences = [
  'Lets build a wall!', // Trump
  'I will win against hillary', // Trump
  'Climate change is important.', // Obama
  'Obamacare has helped americans.' // Obama
];
unseenSentences.forEach(function (sentence) {
  console.log(classifier.categorize(sentence));
});
DEMO
Go to: sli.do
Event Code: 'J156'
Tweet and Category
Chapter 3
Validate
Data
MODEL
VALIDATE
Original Data Set
Training Data
Test Data
Original Data Set
Training Data
Classifier
Training
Validating
Test Data
Original Data Set
Training Data
Classifier
Training
classifier.learn(x, a)
Validating
classifier.categorize(y)
'b'
Test Data
Choosing Test Data
- Holdout method
- 70%/30% (train/test)
- Cross Validation
LEAVE P OUT CROSS VALIDATION?
TEST |
---|
TEST |
TRAIN |
TRAIN |
TEST |
---|
TRAIN |
TEST |
TRAIN |
TEST |
---|
TRAIN |
TRAIN |
TEST |
TRAIN |
TEST |
TEST |
TRAIN |
TRAIN |
TRAIN |
TEST |
TEST |
TRAIN |
TEST |
TRAIN |
TEST |
Complexity Grows
- The goal of cross validation is to get a measure of the model's performance (fit) that does not depend on the particular data it was trained on.
- TP, TN, FP, FN
MEASURE OF FIT
// Leave-one-out cross validation of the Trump/Obama classifier.
//
// For every tweet in the combined data set a fresh classifier is trained on
// all the OTHER tweets and then asked to label the held-out tweet. The
// outcomes are accumulated into a confusion matrix in which 'obama' is the
// positive class and 'trump' the negative class.
var bayes = require('bayes');
var trumpTweets = require('./tweetFormatted.json');
var obamaTweets = require('./tweetFormatted2.json');
const data = [{name: 'trump', tweets: trumpTweets}, {name: 'obama', tweets: obamaTweets}];
var totalDataCount = trumpTweets.length + obamaTweets.length;
var tp = 0; // actual obama, predicted obama
var tn = 0; // actual trump, predicted trump
var fp = 0; // actual trump, predicted obama
var fn = 0; // actual obama, predicted trump
var t0 = new Date().getTime();
// Iterate through every historic data element index
for (var testIndex = 0; testIndex < totalDataCount; testIndex++) {
  console.log(testIndex); // progress indicator
  // instantiate a new model so nothing learned in a previous fold leaks in
  var classifier = bayes();
  var testData = [];
  // 0-based position of the current tweet across both presidents' lists
  var counter = 0;
  for (var president of data) {
    for (var tweet of president.tweets) {
      if (counter === testIndex) {
        // If equal to test index then omit from training.
        testData.push({president: president.name, tweet: tweet});
      } else {
        // Train on all other data elements.
        classifier.learn(tweet.text, president.name);
      }
      // Incremented AFTER the comparison so positions run 0..N-1 and line
      // up with testIndex; every tweet is held out exactly once.
      counter++;
    }
  }
  // Use test data.
  for (var test of testData) {
    var predicted = classifier.categorize(test.tweet.text);
    if (predicted === test.president) {
      if (test.president === 'obama') {
        tp++;
      } else {
        tn++;
      }
    } else if (test.president === 'obama') {
      // An Obama tweet that was not labelled 'obama' is a missed positive.
      fn++;
    } else {
      // A Trump tweet that was not labelled 'trump' is a false alarm.
      fp++;
    }
  }
}
var t1 = new Date().getTime();
console.log('total tests: ', (tp + tn + fp + fn));
console.log(`TP = ${tp}`);
console.log(`TN = ${tn}`);
console.log(`FP = ${fp}`);
console.log(`FN = ${fn}`);
console.log('Took ' + (t1 - t0) + ' milliseconds.')
3195
3123
27
82
CONFUSION MATRIX
Actual
Predicted
accuracy
Why JS?
- Privacy: Everything stays client side (GDPR?)
- Novel user experiences
- Sensor data from mobile devices
But I love TensorFlow...
eFFICIENT?
Uses client GPU via WebGL
DATA
MODEL
VALIDATE
Conclusion
- You can use machine learning techniques without going deep into maths and theory.
- There are some great libraries that simplify applying machine learning.
- You have access to more labelled historic data than you think; be creative.
bene@theodo.co.uk
https://www.linkedin.com/in/benjaminellerby/
False Classification
Subtitle
Building a Trump / Obama Tweet Classifier with 98% Accuracy
By Ben Ellerby
Building a Trump / Obama Tweet Classifier with 98% Accuracy
Talk given at the London Deep Learning & AI Meetup.
- 949