
Commit 84dcc8e

More robust (and faster!) handling of tweet fetching/analysis.
Parent: 1aab180

3 files changed: 37 additions & 7 deletions

ch06/README.md

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
+Chapter 6 - Classification II - Sentiment Analysis
+==================================================
+
+When doing the last code sanity checks for the book, Twitter
+was still using API version 1.0, which did not require authentication.
+With its switch to version 1.1, this has now changed.
+
+If you have not yet created your personal Twitter access keys and
+tokens, please do so at
+[https://dev.twitter.com/docs/auth/tokens-devtwittercom](https://dev.twitter.com/docs/auth/tokens-devtwittercom) and paste the keys/secrets into twitterauth.py.
+
+Note that some tweets might be missing when you run install.py.
+We experimented a bit with the tweet fetch rate and found that
+max_tweets_per_hr=10000 works just fine now that we are using OAuth. If you experience issues, you might want to lower this value.
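For readers setting this up, here is a minimal sketch of what twitterauth.py could look like. The variable names are assumptions, since this commit does not include the file itself:

```python
# twitterauth.py -- sketch only; the variable names are assumptions.
# Paste in the values generated at
# https://dev.twitter.com/docs/auth/tokens-devtwittercom
CONSUMER_KEY = "<your consumer key>"
CONSUMER_SECRET = "<your consumer secret>"
ACCESS_TOKEN_KEY = "<your access token>"
ACCESS_TOKEN_SECRET = "<your access token secret>"
```

The authenticated python-twitter client that the download loop in install.py relies on (the `api` object) could then be built along these lines; again a sketch, not necessarily the exact wiring in the repository:

```python
# Build an OAuth-authenticated python-twitter client from twitterauth.py.
import twitter
import twitterauth

api = twitter.Api(consumer_key=twitterauth.CONSUMER_KEY,
                  consumer_secret=twitterauth.CONSUMER_SECRET,
                  access_token_key=twitterauth.ACCESS_TOKEN_KEY,
                  access_token_secret=twitterauth.ACCESS_TOKEN_SECRET)
```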

ch06/install.py

Lines changed: 17 additions & 5 deletions
@@ -14,9 +14,6 @@
 # Right now we use unauthenticated requests, which are rate-limited to 150/hr.
 # We use 125/hr to stay safe.
 #
-# We could more than double the download speed by using authentication with
-# OAuth logins. But for now, this is too much of a PITA to implement. Just let
-# the script run over a weekend and you'll have all the data.
 #
 # - Niek Sanders
@@ -139,7 +136,7 @@ def download_tweets(fetch_list, raw_dir):
     os.mkdir(raw_dir)

     # stay within rate limits
-    max_tweets_per_hr = 125
+    max_tweets_per_hr = 10000
     download_pause_sec = 3600 / max_tweets_per_hr

     # download tweets
@@ -159,7 +156,22 @@ def download_tweets(fetch_list, raw_dir):
         # urllib.urlretrieve(url, raw_dir + item[2] + '.json')

         # New Twitter API 1.1
-        json_data = api.GetStatus(item[2]).AsJsonString()
+        try:
+            json_data = api.GetStatus(item[2]).AsJsonString()
+        except twitter.TwitterError, e:
+            # raise on unexpected errors, but skip tweets that no longer exist
+            fatal = True
+            for m in e.message:
+                if m['code'] == 34:
+                    print "Tweet missing: ", item
+                    # [{u'message': u'Sorry, that page does not exist', u'code': 34}]
+                    fatal = False
+                    break
+
+            if fatal:
+                raise
+            else:
+                continue
+
         with open(raw_dir + item[2] + '.json', "w") as f:
             f.write(json_data + "\n")
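For reference, the skip-on-missing behaviour added to the download loop boils down to the following standalone sketch. It assumes an authenticated python-twitter `api` object as above; the helper name fetch_tweet_json is illustrative, and error code 34 is Twitter's "Sorry, that page does not exist":

```python
import twitter  # python-twitter, as used by install.py


def fetch_tweet_json(api, tweet_id):
    """Return the tweet as a JSON string, or None if Twitter reports the
    tweet as missing (error code 34); re-raise any other TwitterError."""
    try:
        return api.GetStatus(tweet_id).AsJsonString()
    except twitter.TwitterError, e:
        # e.message is a list of error dicts, e.g.
        # [{u'message': u'Sorry, that page does not exist', u'code': 34}]
        if any(m['code'] == 34 for m in e.message):
            print "Tweet missing: ", tweet_id
            return None
        raise
```

Callers can then simply skip None results instead of aborting the whole download, which is what the continue in the loop above achieves.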

ch06/utils.py

Lines changed: 6 additions & 2 deletions
@@ -54,9 +54,13 @@ def load_sanders_data(dirname=".", line_count=-1):

             tweet_fn = os.path.join(
                 DATA_DIR, dirname, 'rawdata', '%s.json' % tweet_id)
-            tweet = json.load(open(tweet_fn, "r"))
-            if 'text' in tweet and tweet['user']['lang'] == "en":
+            try:
+                tweet = json.load(open(tweet_fn, "r"))
+            except IOError:
+                print("Tweet '%s' not found. Skip." % tweet_fn)
+                continue

+            if 'text' in tweet and tweet['user']['lang'] == "en":
                 topics.append(topic)
                 labels.append(label)
                 tweets.append(tweet['text'])
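The same guard can be factored into a small helper, shown here only as a sketch; utils.py in this commit keeps the try/except inline as shown above:

```python
import json


def load_tweet(tweet_fn):
    """Return the parsed tweet dict, or None if the raw .json file was never
    downloaded (install.py skips tweets that Twitter reports as missing)."""
    try:
        with open(tweet_fn, "r") as f:
            return json.load(f)
    except IOError:
        print("Tweet '%s' not found. Skip." % tweet_fn)
        return None
```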
