DominiqueLoyer/misc_Python

Miscellaneous Python snippets, to use at your own risk (disclaimer!)


Author: Dominique Loyer

Twitter streamer

```python
from tweepy.streaming import StreamListener
from tweepy import OAuthHandler
from tweepy import Stream

import twitter_credentials

# # # # TWITTER STREAMER # # # #
class TwitterStreamer():
    """
    Class for streaming and processing live tweets.
    """
    def __init__(self):
        pass

    def stream_tweets(self, fetched_tweets_filename, hash_tag_list):
        # This handles Twitter authentication and the connection to the Twitter Streaming API
        listener = StdOutListener(fetched_tweets_filename)
        auth = OAuthHandler(twitter_credentials.CONSUMER_KEY, twitter_credentials.CONSUMER_SECRET)
        auth.set_access_token(twitter_credentials.ACCESS_TOKEN, twitter_credentials.ACCESS_TOKEN_SECRET)
        stream = Stream(auth, listener)

        # This line filters Twitter Streams to capture data by the keywords:
        stream.filter(track=hash_tag_list)


# # # # TWITTER STREAM LISTENER # # # #
class StdOutListener(StreamListener):
    """
    This is a basic listener that just prints received tweets to stdout.
    """
    def __init__(self, fetched_tweets_filename):
        self.fetched_tweets_filename = fetched_tweets_filename

    def on_data(self, data):
        try:
            print(data)
            with open(self.fetched_tweets_filename, 'a') as tf:
                tf.write(data)
            return True
        except BaseException as e:
            print("Error on_data %s" % str(e))
        return True

    def on_error(self, status):
        print(status)


if __name__ == '__main__':
    # Authenticate using twitter_credentials.py and connect to the Twitter Streaming API.
    hash_tag_list = ["donald trump", "hillary clinton", "barack obama", "bernie sanders"]
    fetched_tweets_filename = "tweets.txt"

    twitter_streamer = TwitterStreamer()
    twitter_streamer.stream_tweets(fetched_tweets_filename, hash_tag_list)
```
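Note: this snippet targets tweepy 3.x. In tweepy 4.x, StreamListener was merged into tweepy.Stream (and Twitter has since retired the v1.1 streaming endpoint it relies on). A rough 4.x-style equivalent, as a sketch only, with FileStream as a made-up class name:

```python
import tweepy
import twitter_credentials

# Sketch for tweepy >= 4.0: subclass tweepy.Stream directly.
# FileStream is a hypothetical name used only for this example.
class FileStream(tweepy.Stream):
    def __init__(self, fetched_tweets_filename, *auth_args):
        super().__init__(*auth_args)
        self.fetched_tweets_filename = fetched_tweets_filename

    def on_data(self, raw_data):
        if isinstance(raw_data, bytes):  # the raw payload may arrive as bytes
            raw_data = raw_data.decode("utf-8")
        print(raw_data)
        with open(self.fetched_tweets_filename, 'a') as tf:
            tf.write(raw_data)
        return True

stream = FileStream("tweets.txt",
                    twitter_credentials.CONSUMER_KEY,
                    twitter_credentials.CONSUMER_SECRET,
                    twitter_credentials.ACCESS_TOKEN,
                    twitter_credentials.ACCESS_TOKEN_SECRET)
stream.filter(track=["donald trump", "hillary clinton"])
```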

RDF python

```python
#%%
!pip install rdflib
#%%
from rdflib import Graph
#%%
!pip show rdflib
#%%
# Load a local Turtle file into a graph
# (the original tried format='rq'; .rq files are SPARQL queries, not RDF data)
g = Graph()
g.parse("ex002.ttl", format="turtle")
print(g)
#%%
# Run a SPARQL query over the graph; iterate over the query results,
# not over the graph itself
query = """
SELECT ?person
WHERE { ?person <http://dbpedia.org/ontology/hasName> "Idham Al-Taif Mahmoud" }
"""
for row in g.query(query):
    print(row)
#%%
# Parse a remote resource (note the spelling: "resource", not "ressource",
# and parse() is a method call, not an assignment)
g.parse("http://dbpedia.org/resource/Michael_Jackson")
#%%
# A query using a PREFIX declaration (the query text must be a string)
query = """
PREFIX d: <http://learningsparql.com/ns/demo#>
SELECT ?person
WHERE { ?person d:homeTel "(229) 276-5135" . }
"""
print(query)
for row in g.query(query):
    print(row)
#%%
# The same query from the command line with Jena's arq tool:
# arq --data ex002.ttl --query ex003.rq
```
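The .ttl files referenced above are not included in this repo, so the queries have nothing to match against. A self-contained variant (with a made-up subject URI, purely for illustration) builds a tiny graph in memory first:

```python
from rdflib import Graph, Namespace, Literal, URIRef

# Build a tiny in-memory graph so the SPARQL query has something to match
d = Namespace("http://learningsparql.com/ns/demo#")
g = Graph()
g.add((URIRef("http://example.org/people#richard"),  # hypothetical subject
       d.homeTel,
       Literal("(229) 276-5135")))

query = """
PREFIX d: <http://learningsparql.com/ns/demo#>
SELECT ?person
WHERE { ?person d:homeTel "(229) 276-5135" . }
"""
for row in g.query(query):
    print(row.person)
```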

SVM and random forest on Iris

```python
#%%
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import datasets
from sklearn import svm

iris = datasets.load_iris()
iris.data.shape, iris.target.shape
#%%
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, test_size=0.4, random_state=0)
X_train.shape, y_train.shape
X_test.shape, y_test.shape
clf = svm.SVC(kernel='linear', C=1).fit(X_train, y_train)
clf.score(X_test, y_test)
#%%
from sklearn.model_selection import cross_val_score
clf = svm.SVC(kernel='linear', C=1)
scores = cross_val_score(clf, iris.data, iris.target, cv=5)
scores
#%%
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
```
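Despite the original heading, the snippet above fits an SVM. A minimal random-forest version of the same five-fold experiment, as a sketch that reuses `iris` from above:

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

# Same five-fold cross-validation as above, but with an actual random forest
rf = RandomForestClassifier(n_estimators=100, random_state=0)
rf_scores = cross_val_score(rf, iris.data, iris.target, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (rf_scores.mean(), rf_scores.std() * 2))
```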

kNN classifier on Iris

```python
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# Load Iris dataset (a well-known classification dataset)
iris = load_iris()
X = iris.data
y = iris.target

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create KNeighborsClassifier with k=3
knn_classifier = KNeighborsClassifier(n_neighbors=3)

# Fit the classifier on the training data
knn_classifier.fit(X_train, y_train)

# Make predictions on the test set
predictions = knn_classifier.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)
```
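kNN classifies by raw distances, so feature scaling can change its predictions. A small variant, as a sketch that reuses the train/test split above, standardizes the features first:

```python
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

# Standardize features before kNN, since it relies on raw distances
scaled_knn = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=3))
scaled_knn.fit(X_train, y_train)
print("Accuracy (scaled):", scaled_knn.score(X_test, y_test))
```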

TensorFlow

```python
# Import TensorFlow
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Sample dataset (replace this with your dataset)
# Assuming X_train and y_train are your input and output data
# Modify this according to your actual dataset
X_train = ...      # Your input data
y_train = ...      # Your output data
input_shape = ...  # Number of input features per sample

# Define the neural network model
model = Sequential([
    Dense(64, activation='relu', input_shape=(input_shape,)),  # Hidden layer with 64 neurons and ReLU activation
    Dense(32, activation='relu'),                              # Another hidden layer with 32 neurons and ReLU activation
    Dense(1, activation='sigmoid')                             # Output layer with 1 neuron for binary classification
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, epochs=10, batch_size=32)  # Adjust epochs and batch_size as needed
```
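To smoke-test the model without a real dataset, one option is to fill the placeholders with synthetic data (entirely made up, illustrative only):

```python
import numpy as np

# Synthetic stand-ins for the placeholders above (illustrative only)
input_shape = 20  # assumed feature count
X_train = np.random.rand(500, input_shape).astype("float32")
y_train = (X_train.sum(axis=1) > input_shape / 2).astype("float32")  # toy binary labels
```

With these defined, the `Sequential` model above compiles and trains as written.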

Time Series

```python
#%%
import warnings
import itertools
import numpy as np
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore")
plt.style.use('fivethirtyeight')
import pandas as pd
import statsmodels.api as sm
import matplotlib

matplotlib.rcParams['axes.labelsize'] = 14
matplotlib.rcParams['xtick.labelsize'] = 12
matplotlib.rcParams['ytick.labelsize'] = 12
matplotlib.rcParams['text.color'] = 'k'
```

We are going to do time series analysis and forecasting for furniture sales.

```python
#%%
df = pd.read_excel("Superstore.xls")
furniture = df.loc[df['Category'] == 'Furniture']
```

We have a good four years of furniture sales data.

```python
#%%
furniture['Order Date'].min()
#%%
furniture['Order Date'].max()
```

## Data preprocessing

This step includes removing columns we do not need, checking for missing values, aggregating sales by date, and so on.

```python
#%%
cols = ['Row ID', 'Order ID', 'Ship Date', 'Ship Mode', 'Customer ID', 'Customer Name',
        'Segment', 'Country', 'City', 'State', 'Postal Code', 'Region', 'Product ID',
        'Category', 'Sub-Category', 'Product Name', 'Quantity', 'Discount', 'Profit']
furniture.drop(cols, axis=1, inplace=True)
furniture = furniture.sort_values('Order Date')
#%%
furniture.isnull().sum()
#%%
furniture = furniture.groupby('Order Date')['Sales'].sum().reset_index()
#%%
furniture.head()
```

## Indexing with time series data

```python
#%%
furniture = furniture.set_index('Order Date')
furniture.index
```

Our current datetime data can be tricky to work with; therefore, we will use the average daily sales value for each month instead, with the start of each month as the timestamp.

```python
#%%
y = furniture['Sales'].resample('MS').mean()
#%%
# Have a quick peek at the 2017 sales data.
y['2017':]
```

## Visualizing furniture sales time series data

```python
#%%
y.plot(figsize=(15, 6))
plt.show()
```

Some distinguishable patterns appear when we plot the data. The time series has a seasonal pattern: sales are always low at the beginning of the year and high at the end of the year. There is also a strong upward trend within any single year, with a couple of low months in the middle of the year.

We can also visualize our data using a method called time-series decomposition that allows us to decompose the time series into three distinct components: trend, seasonality, and noise.

```python
#%%
from pylab import rcParams
rcParams['figure.figsize'] = 18, 8

decomposition = sm.tsa.seasonal_decompose(y, model='additive')
fig = decomposition.plot()
plt.show()
```

The plot above clearly shows that furniture sales are unstable, along with their obvious seasonality.

## Time series forecasting with ARIMA

We are going to apply one of the most commonly used methods for time-series forecasting, known as ARIMA, which stands for Autoregressive Integrated Moving Average.
### Parameter Selection for the ARIMA Time Series Model

```python
#%%
p = d = q = range(0, 2)
pdq = list(itertools.product(p, d, q))
seasonal_pdq = [(x[0], x[1], x[2], 12) for x in list(itertools.product(p, d, q))]
print('Examples of parameter combinations for Seasonal ARIMA...')
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[1]))
print('SARIMAX: {} x {}'.format(pdq[1], seasonal_pdq[2]))
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[3]))
print('SARIMAX: {} x {}'.format(pdq[2], seasonal_pdq[4]))
#%%
for param in pdq:
    for param_seasonal in seasonal_pdq:
        try:
            mod = sm.tsa.statespace.SARIMAX(y,
                                            order=param,
                                            seasonal_order=param_seasonal,
                                            enforce_stationarity=False,
                                            enforce_invertibility=False)
            results = mod.fit()
            print('ARIMA{}x{}12 - AIC:{}'.format(param, param_seasonal, results.aic))
        except Exception:
            continue
#%%
mod = sm.tsa.statespace.SARIMAX(y,
                                order=(1, 1, 1),
                                seasonal_order=(1, 1, 0, 12),
                                enforce_stationarity=False,
                                enforce_invertibility=False)
results = mod.fit()
print(results.summary().tables[1])
#%%
results.plot_diagnostics(figsize=(16, 8))
plt.show()
```

## Validating forecasts

To help us understand the accuracy of our forecasts, we compare predicted sales to real sales of the time series, and we set forecasts to start at 2017-01-01 and run to the end of the data.

```python
#%%
pred = results.get_prediction(start=pd.to_datetime('2017-01-01'), dynamic=False)
pred_ci = pred.conf_int()
ax = y['2014':].plot(label='observed')
pred.predicted_mean.plot(ax=ax, label='One-step ahead Forecast', alpha=.7, figsize=(14, 7))
ax.fill_between(pred_ci.index,
                pred_ci.iloc[:, 0],
                pred_ci.iloc[:, 1], color='k', alpha=.2)
ax.set_xlabel('Date')
ax.set_ylabel('Furniture Sales')
plt.legend()
plt.show()
```

The line plot shows the observed values compared to the rolling forecast predictions. Overall, our forecasts align with the true values very well, showing an upward trend that starts at the beginning of the year.

```python
#%%
y_forecasted = pred.predicted_mean
y_truth = y['2017-01-01':]

# Compute the mean squared error
mse = ((y_forecasted - y_truth) ** 2).mean()
print('The Mean Squared Error of our forecasts is {}'.format(round(mse, 2)))
#%%
print('The Root Mean Squared Error of our forecasts is {}'.format(round(np.sqrt(mse), 2)))
```

In statistics, the mean squared error (MSE) of an estimator measures the average of the squares of the errors, that is, the average squared difference between the estimated values and what is estimated. The MSE is a measure of the quality of an estimator: it is always non-negative, and the smaller the MSE, the closer we are to finding the line of best fit.

The Root Mean Squared Error (RMSE) tells us that our model was able to forecast the average daily furniture sales in the test set within 151.64 of the real sales. Our daily furniture sales range from around 400 to over 1200. In my opinion, this is a pretty good model so far.

## Producing and visualizing forecasts

```python
#%%
pred_uc = results.get_forecast(steps=100)
pred_ci = pred_uc.conf_int()
ax = y.plot(label='observed', figsize=(14, 7))
pred_uc.predicted_mean.plot(ax=ax, label='Forecast')
ax.fill_between(pred_ci.index,
                pred_ci.iloc[:, 0],
                pred_ci.iloc[:, 1], color='k', alpha=.25)
ax.set_xlabel('Date')
ax.set_ylabel('Furniture Sales')
plt.legend()
plt.show()
```

Our model clearly captured furniture sales seasonality. As we forecast further out into the future, it is natural for us to become less confident in our values. This is reflected by the confidence intervals generated by our model, which grow larger as we move further out into the future.

The above time series analysis for furniture makes me curious about the other categories and how they compare with each other over time. Therefore, we are going to compare the time series of furniture and office supplies.
## Time Series comparison of furniture sales and office supplies

### Data Preprocessing

```python
#%%
furniture = df.loc[df['Category'] == 'Furniture']
office = df.loc[df['Category'] == 'Office Supplies']
```

According to our data, there were far more sales of Office Supplies than of Furniture over the years.

```python
#%%
furniture.shape, office.shape
#%%
cols = ['Row ID', 'Order ID', 'Ship Date', 'Ship Mode', 'Customer ID', 'Customer Name',
        'Segment', 'Country', 'City', 'State', 'Postal Code', 'Region', 'Product ID',
        'Category', 'Sub-Category', 'Product Name', 'Quantity', 'Discount', 'Profit']
furniture.drop(cols, axis=1, inplace=True)
office.drop(cols, axis=1, inplace=True)
furniture = furniture.sort_values('Order Date')
office = office.sort_values('Order Date')
furniture = furniture.groupby('Order Date')['Sales'].sum().reset_index()
office = office.groupby('Order Date')['Sales'].sum().reset_index()
```

Have a quick peek. Perfect!

```python
#%%
furniture.head()
#%%
office.head()
```

### Data exploration

We are going to compare the two categories' sales in the same time period. This means combining the two data frames into one and plotting the two categories' time series in a single plot.

```python
#%%
furniture = furniture.set_index('Order Date')
office = office.set_index('Order Date')
y_furniture = furniture['Sales'].resample('MS').mean()
y_office = office['Sales'].resample('MS').mean()
furniture = pd.DataFrame({'Order Date': y_furniture.index, 'Sales': y_furniture.values})
office = pd.DataFrame({'Order Date': y_office.index, 'Sales': y_office.values})
store = furniture.merge(office, how='inner', on='Order Date')
store.rename(columns={'Sales_x': 'furniture_sales', 'Sales_y': 'office_sales'}, inplace=True)
store.head()
#%%
plt.figure(figsize=(20, 8))
plt.plot(store['Order Date'], store['furniture_sales'], 'b-', label='furniture')
plt.plot(store['Order Date'], store['office_sales'], 'r-', label='office supplies')
plt.xlabel('Date'); plt.ylabel('Sales'); plt.title('Sales of Furniture and Office Supplies')
plt.legend();
```

We observe that sales of furniture and office supplies share a similar seasonal pattern. Early in the year is the off-season for both categories. It seems summer is quiet for office supplies too. In addition, average daily sales for furniture are higher than those of office supplies in most months. That is understandable, as the value of furniture should be much higher than that of office supplies. Occasionally, office supplies passed furniture on average daily sales. Let's find out when office supplies' sales first surpassed furniture's.

```python
#%%
# .ix was removed from pandas; .loc does the same job here
first_date = store.loc[np.min(list(np.where(store['office_sales'] > store['furniture_sales'])[0])), 'Order Date']
print("The first time office supplies produced higher sales than furniture was {}.".format(first_date.date()))
```

It was July 2014.

### Time Series Modeling with Prophet

Released by Facebook in 2017, the forecasting tool Prophet is designed for analyzing time series that display patterns on different time scales, such as yearly, weekly, and daily. It also has advanced capabilities for modeling the effects of holidays on a time series and implementing custom changepoints. Therefore, we are using Prophet to get a model up and running.
```python
#%%
from fbprophet import Prophet  # in Prophet >= 1.0 the package is named "prophet"

furniture = furniture.rename(columns={'Order Date': 'ds', 'Sales': 'y'})
furniture_model = Prophet(interval_width=0.95)
furniture_model.fit(furniture)

office = office.rename(columns={'Order Date': 'ds', 'Sales': 'y'})
office_model = Prophet(interval_width=0.95)
office_model.fit(office)
#%%
furniture_forecast = furniture_model.make_future_dataframe(periods=36, freq='MS')
furniture_forecast = furniture_model.predict(furniture_forecast)
office_forecast = office_model.make_future_dataframe(periods=36, freq='MS')
office_forecast = office_model.predict(office_forecast)
#%%
plt.figure(figsize=(18, 6))
furniture_model.plot(furniture_forecast, xlabel='Date', ylabel='Sales')
plt.title('Furniture Sales');
#%%
plt.figure(figsize=(18, 6))
office_model.plot(office_forecast, xlabel='Date', ylabel='Sales')
plt.title('Office Supplies Sales');
```

### Compare Forecasts

We already have three years of forecasts into the future for these two categories. We will now join them together to compare their future forecasts.

```python
#%%
furniture_names = ['furniture_%s' % column for column in furniture_forecast.columns]
office_names = ['office_%s' % column for column in office_forecast.columns]
merge_furniture_forecast = furniture_forecast.copy()
merge_office_forecast = office_forecast.copy()
merge_furniture_forecast.columns = furniture_names
merge_office_forecast.columns = office_names
forecast = pd.merge(merge_furniture_forecast, merge_office_forecast, how='inner',
                    left_on='furniture_ds', right_on='office_ds')
forecast = forecast.rename(columns={'furniture_ds': 'Date'}).drop('office_ds', axis=1)
forecast.head()
```

### Visualizing the trend and the forecast

```python
#%%
plt.figure(figsize=(10, 7))
plt.plot(forecast['Date'], forecast['furniture_trend'], 'b-')
plt.plot(forecast['Date'], forecast['office_trend'], 'r-')
plt.legend(); plt.xlabel('Date'); plt.ylabel('Sales')
plt.title('Furniture vs. Office Supplies Sales Trend');
#%%
plt.figure(figsize=(10, 7))
plt.plot(forecast['Date'], forecast['furniture_yhat'], 'b-')
plt.plot(forecast['Date'], forecast['office_yhat'], 'r-')
plt.legend(); plt.xlabel('Date'); plt.ylabel('Sales')
plt.title('Furniture vs. Office Supplies Estimate');
```

### Trends and Patterns

Now we can use the Prophet models to inspect different trends of these two categories in the data.

```python
#%%
furniture_model.plot_components(furniture_forecast);
#%%
office_model.plot_components(office_forecast);
```

It is good to see that the sales for both furniture and office supplies have been linearly increasing over time, although office supplies' growth seems slightly stronger. The worst month for furniture is April; the worst month for office supplies is February. The best month for furniture is December, and the best month for office supplies is November.

There are many time-series analyses we can explore from here, such as forecasting with uncertainty bounds, changepoint and anomaly detection, and forecasting a time series with an external data source. We have only scratched the surface. Stay tuned for future work on time-series analysis.

Loops and conditions

```python
#
# Example file for working with conditional statements
#
def main():
    x, y = 10, 100

    # conditional flow uses if, elif, else
    if (x < y):
        st = "x is less than y"
    elif (x == y):
        st = "x is same as y"
    else:
        st = "x is greater than y"
    print(st)

    # conditional statements let you use "a if C else b"
    st = "x is less than y" if (x < y) else "x is greater than or equal to y"
    print(st)

    # Python does not have support for higher-order conditionals
    # like "switch-case" in other languages

if __name__ == "__main__":
    main()
```
```python
#
# Example file for working with functions
#

# define a basic function
def func1():
    print("I am a function")

# function that takes arguments
def func2(arg1, arg2):
    print(arg1, " ", arg2)

# function that returns a value
def cube(x):
    return x * x * x

# function with default value for an argument
def power(num, x=1):
    result = 1
    for i in range(x):
        result = result * num
    return result

# function with variable number of arguments
def multi_add(*args):
    result = 0
    for x in args:
        result = result + x
    return result

func1()
print(func1())
print(func1)
func2(10, 20)
print(func2(10, 20))
print(cube(3))
print(power(2))
print(power(2, 3))
print(power(x=3, num=2))
print(multi_add(4, 5, 10, 4))
```
```
I am a function
I am a function
None
<function func1 at 0x108d1cc80>
10   20
10   20
None
27
2
8
8
23
```

Example file for working with classes

```python
#
# Example file for working with classes
#
def main():
    pass

if __name__ == "__main__":
    main()
```
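The original file is just scaffolding with an empty `main()`. A minimal illustrative class (my addition, not from the repo) could fill it in like this:

```python
# Illustrative only: a tiny class to exercise the scaffolding above
class Vehicle:
    def __init__(self, name, wheels=4):
        self.name = name
        self.wheels = wheels

    def describe(self):
        return "%s has %d wheels" % (self.name, self.wheels)

def main():
    v = Vehicle("car")
    print(v.describe())  # car has 4 wheels

if __name__ == "__main__":
    main()
```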

Declare a variable and initialize it

```python
f = 0
print(f)
```

Re-declaring the variable works

f = "abc" print (f)

ERROR: variables of different types cannot be combined

[[print]] ("string type " + 123) print ("string type " + str(123))

Global vs. local variables in functions

```python
def someFunction():
    global f
    f = "def"
    print(f)

someFunction()
print(f)

del f
print(f)  # NameError: f was deleted above
```

```python
#
# Example file for working with loops
#
def main():
    x = 0

    # define a while loop
    while (x < 5):
        print(x)
        x = x + 1

    # define a for loop
    for x in range(5, 10):
        print(x)

    # use a for loop over a collection
    days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    for d in days:
        print(d)

    # use the break and continue statements
    for x in range(5, 10):
        if (x == 7):
            break
        if (x % 2 == 0):
            continue
        print(x)

    # use the enumerate() function to get the index
    days = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    for i, d in enumerate(days):
        print(i, d)

if __name__ == "__main__":
    main()
```
```python
#
# Read and write files using the built-in Python file methods
#
def main():
    # Open a file for writing and create it if it doesn't exist
    f = open("info_pdo.csv", "w+")
    # Open the file for appending text to the end
    # f = open("textfile.txt", "a+")

    # write some lines of data to the file
    for i in range(10):
        f.write("This is line %d\r\n" % (i + 1))

    # close the file when done
    f.close()

    # Open the file back up and read the contents
    f = open("info_pdo.csv", "r")
    if f.mode == 'r':  # check to make sure that the file was opened
        # use the read() function to read the entire file
        # contents = f.read()
        # print(contents)
        fl = f.readlines()  # readlines reads the individual lines into a list
        for x in fl:
            print(x)

if __name__ == "__main__":
    main()
```
desjopen("info_pdo.csv","w+")
pwd
desj=open("info_pdo.csv","w+")
csvdesj=csvreader(desj)
desj(head)
head?
importcsvcr=csv.reader(open("info_pdo.csv","rb")) forrowincr: print(row)
importcsvwithopen('info_pdo.csv', newline='') ascsvfile: desj=csv.reader(csvfile, delimiter=';', quotechar='|') forrowindesj: print('; '.join(row))
importcsvdesj=open("info_pdo.csv","w+") desjCSV=csv.reader(desj)
importpandasaspddf1=pd.read_csv("info_pdo.csv")

Presidential debate

```python
!pip install tweepy
```

1. Authenticate to Twitter

```python
# Import tweepy to work with the twitter API
import tweepy as tw

# Import numpy and pandas to work with dataframes
import numpy as np
import pandas as pd

# Import matplotlib for viz
from matplotlib import pyplot as plt
```
```python
consumer_key = ''
consumer_secret = ''
access_token = ''
access_token_secret = ''
```
```python
# Authenticate
auth = tw.OAuthHandler(consumer_key, consumer_secret)

# Set Tokens
auth.set_access_token(access_token, access_token_secret)

# Instantiate API
api = tw.API(auth, wait_on_rate_limit=True)
```

2. Get Tweets

```python
hashtag = "#presidentialdebate"
query = tw.Cursor(api.search, q=hashtag).items(1000)
tweets = [{'Tweet': tweet.text, 'Timestamp': tweet.created_at} for tweet in query]
print(tweets)
```
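If you are on tweepy 4.x, note that `API.search` was renamed; to the best of my knowledge the equivalent call, reusing `api` and `hashtag` from above, is:

```python
# tweepy >= 4.0 renamed API.search to API.search_tweets
query = tw.Cursor(api.search_tweets, q=hashtag).items(1000)
tweets = [{'Tweet': tweet.text, 'Timestamp': tweet.created_at} for tweet in query]
```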
```python
df = pd.DataFrame.from_dict(tweets)
df.head()
```
```python
trump_handle = ['DonaldTrump', 'Donald Trump', 'Donald', 'Trump', "Trump's"]
biden_handle = ['JoeBiden', 'Joe Biden', 'Joe', 'Biden', "Biden's"]
```
```python
def identify_subject(tweet, refs):
    flag = 0
    for ref in refs:
        if tweet.find(ref) != -1:
            flag = 1
    return flag

df['Trump'] = df['Tweet'].apply(lambda x: identify_subject(x, trump_handle))
df['Biden'] = df['Tweet'].apply(lambda x: identify_subject(x, biden_handle))
df.head(10)
```

3. Preprocess

```python
# Import stopwords
import nltk
from nltk.corpus import stopwords

# Import textblob
from textblob import Word, TextBlob
```
```python
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = stopwords.words('english')
custom_stopwords = ['RT', '#PresidentialDebate']
```
```python
import re

def preprocess_tweets(tweet, custom_stopwords):
    # Strip punctuation; str.replace does not take a regex, so use re.sub
    processed_tweet = re.sub(r'[^\w\s]', '', tweet)
    processed_tweet = " ".join(word for word in processed_tweet.split() if word not in stop_words)
    processed_tweet = " ".join(word for word in processed_tweet.split() if word not in custom_stopwords)
    processed_tweet = " ".join(Word(word).lemmatize() for word in processed_tweet.split())
    return processed_tweet

df['Processed Tweet'] = df['Tweet'].apply(lambda x: preprocess_tweets(x, custom_stopwords))
df.head()
```
```python
print('Base review\n', df['Tweet'][0])
print('\n------------------------------------\n')
print('Cleaned and lemmatized review\n', df['Processed Tweet'][0])
```

4. Calculate Sentiment

```python
# Calculate polarity and subjectivity
df['polarity'] = df['Processed Tweet'].apply(lambda x: TextBlob(x).sentiment[0])
df['subjectivity'] = df['Processed Tweet'].apply(lambda x: TextBlob(x).sentiment[1])
df[['Processed Tweet', 'Biden', 'Trump', 'polarity', 'subjectivity']].head()
```
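TextBlob's `sentiment` is a (polarity, subjectivity) pair, with polarity in [-1, 1] (negative to positive) and subjectivity in [0, 1] (objective to subjective). A quick sanity check:

```python
from textblob import TextBlob

# Polarity in [-1, 1], subjectivity in [0, 1]
print(TextBlob("I love this debate").sentiment)
print(TextBlob("I hate this debate").sentiment)
```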
```python
# Newer pandas prefers string names here, e.g. .agg(['mean', 'max', 'min', 'median'])
display(df[df['Trump'] == 1][['Trump', 'polarity', 'subjectivity']].groupby('Trump').agg([np.mean, np.max, np.min, np.median]))
df[df['Biden'] == 1][['Biden', 'polarity', 'subjectivity']].groupby('Biden').agg([np.mean, np.max, np.min, np.median])
```

5. Visualise

```python
biden = df[df['Biden'] == 1][['Timestamp', 'polarity']]
biden = biden.sort_values(by='Timestamp', ascending=True)
biden['MA Polarity'] = biden.polarity.rolling(10, min_periods=3).mean()

trump = df[df['Trump'] == 1][['Timestamp', 'polarity']]
trump = trump.sort_values(by='Timestamp', ascending=True)
trump['MA Polarity'] = trump.polarity.rolling(10, min_periods=3).mean()
```
```python
trump.head()
```
```python
repub = 'red'
demo = 'blue'
fig, axes = plt.subplots(2, 1, figsize=(13, 10))
axes[0].plot(biden['Timestamp'], biden['MA Polarity'], color=demo)
axes[0].set_title("\n".join(["Biden Polarity"]))
axes[1].plot(trump['Timestamp'], trump['MA Polarity'], color=repub)
axes[1].set_title("\n".join(["Trump Polarity"]))
fig.suptitle("\n".join(["Presidential Debate Analysis"]), y=0.98)
plt.show()
```
