Sentiment Analysis on Farmers' Protest Using RNN-LSTM
Dataset: https://drive.google.com/drive/folders/1vcFqYVicEA665l7ezUSYIY7f-fj6u0W7?usp=sharing
Reference:
Ashwin Sanjay Neogi, Kirti Anilkumar Garg, Ram Krishn Mishra, Yogesh K Dwivedi, "Sentiment analysis and classification of Indian farmers’ protest using twitter data", International Journal of Information Management Data Insights, Volume 1, Issue 2, 2021, 100019, ISSN 2667-0968, https://doi.org/10.1016/j.jjimei.2021.100019.
Dataset Description:
A total of 18,000 tweets were collected over a period of four months.
Since the farmers' protest began around November 2020, the collection window runs from 5th November 2020 to 5th March 2021. A customized script, built with Python's datetime library, retrieved 150 tweets per day and stored them in a Python list.
The keyword 'farmers protest' was used as the search query, so that all tweets containing the words "farmers", "protest", or "farmers protest" were gathered together.
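The collection script itself is not reproduced in the paper; the loop below is a minimal sketch of the per-day retrieval it describes, where fetch_tweets is a hypothetical stand-in for whatever Twitter search call was actually used.
from datetime import date, timedelta

def collect_tweets(start, end, query="farmers protest", per_day=150):
    # accumulate the tweets for each day in a single Python list
    all_tweets = []
    day = start
    while day <= end:
        # fetch_tweets is hypothetical: substitute the real Twitter API/scraper call here
        all_tweets.extend(fetch_tweets(query=query, on=day, limit=per_day))
        day += timedelta(days=1)
    return all_tweets

tweets = collect_tweets(date(2020, 11, 5), date(2021, 3, 5))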
Importing Required Libraries
import numpy as np
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical  # keras.utils.np_utils was removed in newer Keras versions
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re
!pip install twython
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
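VADER returns a dictionary with neg, neu, pos, and compound fields; a quick check on a made-up sentence:
print(sid.polarity_scores("The farmers deserve our full support"))
# -> a dict of the form {'neg': ..., 'neu': ..., 'pos': ..., 'compound': ...}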
Reading Dataset
# Import the data file
df = pd.read_csv("Tweets.csv")
# keep only the tweet text column
df = df[['Text']]
# Clean the tweets
def cleantext(df):
    # keep the original Text column unchanged; clean a copy
    df['cleaned_tweet'] = df['Text']
    # remove a leading 'rt' (retweet marker), case-insensitively
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'(?i)^rt\s+', "", regex=True)
    # remove user mentions
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'@\w+', "", regex=True)
    # remove URLs
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'https?://\S+', "", regex=True)
    # remove HTML-escaped ampersands (&amp;) and stray 'amp' tokens
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'&amp;|\bamp\b', "", regex=True)
    # collapse repeated whitespace and trim the ends
    df['cleaned_tweet'] = df['cleaned_tweet'].replace(r'\s+', " ", regex=True).str.strip()
    return df
#get the processed tweets
df = cleantext(df)
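A quick sanity check on a made-up tweet shows what the cleaner removes:
sample = pd.DataFrame({'Text': ["rt @someuser farmers protest grows &amp; spreads https://t.co/abc123"]})
print(cleantext(sample)['cleaned_tweet'][0])
# -> "farmers protest grows spreads"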
Print Columns
print(df.columns)
Finding Sentiment Score
df['scores'] = df['cleaned_tweet'].apply(sid.polarity_scores)
df.head()
Compound Score
df['compound'] = df['scores'].apply(lambda score_dict: score_dict['compound'])
df['sentiment'] = ''
df.loc[df.compound > 0, 'sentiment'] = 'Positive'
df.loc[df.compound == 0, 'sentiment'] = 'Neutral'
df.loc[df.compound < 0, 'sentiment'] = 'Negative'
df.head()
Count the Sentiment Values
print(df['sentiment'].value_counts())
Create a New DataFrame
new_df = df[['cleaned_tweet','sentiment']]
print(new_df.sentiment)
Removing Neutral Sentiments
new_df = new_df[new_df.sentiment != "Neutral"]
new_df.head()
Tokenization
tokenizer = Tokenizer(num_words=1500, split=' ')
tokenizer.fit_on_texts(new_df['cleaned_tweet'].values)
X = tokenizer.texts_to_sequences(new_df['cleaned_tweet'])
X = pad_sequences(X)
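On a toy corpus, the two steps look like this (indices are rank-by-frequency and will differ on the real data):
demo = Tokenizer(num_words=1500, split=' ')
demo.fit_on_texts(["farmers protest delhi", "support farmers"])
seqs = demo.texts_to_sequences(["farmers protest", "support farmers protest delhi"])
print(pad_sequences(seqs))  # shorter sequences are zero-padded on the left
# [[0 0 1 2]
#  [4 1 2 3]]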
Model Building using RNN-LSTM
embed_dim = 128
lstm_out = 196
model = Sequential()
model.add(Embedding(1500, embed_dim, input_length=X.shape[1]))  # vocab size must match the tokenizer's num_words; input_length taken from the padded sequences
model.add(SpatialDropout1D(0.2))  # uses the SpatialDropout1D imported above to regularize the embeddings
model.add(LSTM(lstm_out, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation='softmax'))
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
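Printing the summary confirms that the layer output shapes line up with the padded sequence length (assuming the same Keras version used above):
model.summary()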
from sklearn.preprocessing import LabelEncoder
Le = LabelEncoder()
y = Le.fit_transform(new_df['sentiment'])  # alphabetical encoding: Negative -> 0, Positive -> 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
print(len(y))
print(y_train)
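Positive tweets typically dominate after VADER labelling, so an optional variant (not in the original notebook) is to stratify the split, keeping the class ratio equal in both partitions:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.15, random_state=42, stratify=y)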
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=5, batch_size=32)
model.evaluate(X_test,y_test)
print("Prediction: ",model.predict_classes(X_test[5:10]))
print("Actual: \n",y_test[5:10])
import matplotlib.pyplot as plt
plt.figure(figsize = (10,8))
print(history.history.keys())
# summarize history for accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
# summarize history for loss
plt.figure(figsize = (10,8))
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()
model_predict = np.argmax(model.predict(X_test), axis=1)  # predict_classes was removed in recent Keras
# map the encoded predictions back to their label names
model_predict_df = pd.DataFrame({'Labels': Le.inverse_transform(model_predict)})
model_predict_df.head()
import seaborn as sns
plt.figure(figsize = (10,6))
sns.countplot(x=model_predict_df['Labels'])  # newer seaborn versions require the keyword argument
plt.title("Bar Plot of Predicted Sentiments",fontsize = 15)
plt.xlabel("Predicted Sentiment Labels",fontsize = 15)
plt.ylabel("Count",fontsize = 15)
plt.show()
model_predict_df['Labels'].value_counts()
from sklearn.metrics import classification_report
print(classification_report(y_test, model_predict, target_names=Le.classes_))
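A confusion matrix (an addition, not part of the original notebook) makes the per-class errors easier to read:
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, model_predict)
plt.figure(figsize=(6, 5))
sns.heatmap(cm, annot=True, fmt='d', xticklabels=Le.classes_, yticklabels=Le.classes_)
plt.xlabel("Predicted label")
plt.ylabel("Actual label")
plt.title("Confusion Matrix")
plt.show()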