SMA Experiments
EXP - 3
Aim - Social Media Network Analytics Basics (Closeness Centrality, Degree Centrality)
code -
import networkx as nx # Import networkx at the beginning of the script
import matplotlib.pyplot as plt
G=nx.Graph()
a, b, c = 'a', 'b', 'c' # Define a, b, and c
G.add_nodes_from([a,b,c])
G.nodes()
G.add_edge(a,b)
G.add_edge(a,c)
G.edges()
nx.draw(G, with_labels=True) # Add with_labels=True to display node labels
plt.show()
# New
!pip install networkx # Install networkx if it is not already available in the environment
import networkx as nx
#New
nx.degree(G)
#New
nx.degree_centrality(G)
#New
nx.shortest_path(G, source='a', target='c') # Shortest path between the existing nodes 'a' and 'c'
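The aim also lists closeness centrality; since it is derived from the same shortest-path lengths used above, a one-line sketch on the same graph G is enough:
nx.closeness_centrality(G) # Closeness of each node: inverse of its average shortest-path distance to all others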
#New
import matplotlib.pyplot as plt
L=nx.Graph()
a, b, c, d, e, f, g = 'a', 'b', 'c', 'd', 'e', 'f', 'g' # Define the seven node labels
L.add_nodes_from([a,b,c,d,e,f,g])
L.nodes()
L.add_edge(a,b)
L.add_edge(a,e)
L.add_edge(b,e)
L.add_edge(b,c)
L.add_edge(b,d)
L.add_edge(b,f)
L.add_edge(d,e)
L.add_edge(d,g)
L.add_edge(e,f)
L.add_edge(e,g)
L.edges()
nx.draw(L, with_labels=True) # Add with_labels=True to display node labels
plt.show()
# New
nx.betweenness_centrality(L)
# New
G=nx.gnp_random_graph(10, 0.5)
nx.draw(G, with_labels=True)
plt.show()
# New
nx.clustering(G)
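nx.clustering returns a per-node coefficient; if a single graph-level summary is wanted, a one-line sketch:
nx.average_clustering(G) # Mean clustering coefficient over all nodes of the random graph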
EXP - 4
Aim - Scrape YouTube comments with the YouTube Data API and perform sentiment analysis
import requests
video_id = "bw7bVpI5VcM"
api_key = "YOUR_API_KEY"  # Replace with your own YouTube Data API v3 key
# Retrieve video information
video_info_url = f"https://www.googleapis.com/youtube/v3/videos?part=snippet&id={video_id}&key={api_key}"
video_info_response = requests.get(video_info_url)
video_info_data = video_info_response.json()
video_info_data
# Retrieve video comments
comments_url = f"https://www.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId={video_id}&key={api_key}"
comments_response= requests.get(comments_url)
comments_data = comments_response.json()
comments_data
# Extract the comment text
comments = [item["snippet"]["topLevelComment"]["snippet"]["textOriginal"] for item in comments_data["items"]]
print(comments)
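The request above returns only the first page of comment threads; a sketch (the helper fetch_all_comments and its max_pages limit are our own additions) that follows nextPageToken to collect more:
def fetch_all_comments(video_id, api_key, max_pages=5):
    all_comments, page_token = [], None
    for _ in range(max_pages):
        url = (
            "https://www.googleapis.com/youtube/v3/commentThreads"
            f"?part=snippet&videoId={video_id}&key={api_key}&maxResults=100"
        )
        if page_token:
            url += f"&pageToken={page_token}"
        data = requests.get(url).json()
        all_comments += [
            item["snippet"]["topLevelComment"]["snippet"]["textOriginal"]
            for item in data.get("items", [])
        ]
        page_token = data.get("nextPageToken")
        if not page_token:  # no more pages
            break
    return all_comments
# comments = fetch_all_comments(video_id, api_key)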
from textblob import TextBlob
def get_comment_sentiment(comment):
    analysis = TextBlob(comment)
    if analysis.sentiment.polarity > 0:
        return "Positive"
    elif analysis.sentiment.polarity == 0:
        return "Neutral"
    else:
        return "Negative"
comment_list = []
sentiment_list = []
for comment in comments:
    sentiment = get_comment_sentiment(comment)
    comment_list.append(comment)
    sentiment_list.append(sentiment)
    print(f"{comment} : {sentiment}")
import pandas as pd
sentiment_df = pd.DataFrame({"Comments": comment_list,"Sentiment": sentiment_list})
sentiment_df.head()
sentiment_df.to_csv("YouTube_Comments_Sentiment.csv", index=False)  # Save without the row index
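As a quick look at the overall distribution (a small sketch; the plot labels are our own choices), the sentiment counts can be plotted straight from the DataFrame:
import matplotlib.pyplot as plt
sentiment_counts = sentiment_df["Sentiment"].value_counts()
plt.figure(figsize=(6, 4))
plt.bar(sentiment_counts.index, sentiment_counts.values)
plt.title("YouTube Comment Sentiment")
plt.xlabel("Sentiment")
plt.ylabel("Number of Comments")
plt.show()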
!pip install boilerpipe3
from boilerpipe.extract import Extractor
URL="https://www.amazon.in/s?k=grass+mat+for+baby&crid=3FKEPQH8TQDJ1&sprefix=grass+mat+for+baby%2Caps%2C211&ref=nb_sb_noss_1"
extractor = Extractor(extractor='ArticleExtractor', url=URL)
print(extractor.getText())
!pip install feedparser
import feedparser
FEED_URL="http://feeds.feedburner.com/oreilly/radar/atom"
fp=feedparser.parse(FEED_URL)
for e in fp.entries:
    print(e.title)
    print(e.links[0].href)
    print(e.content[0].value)
EXP - 5
Aim - Twitter Sentiment Analysis
**1. Scrape Twitter Data for chatGPT4 Tweets**
!pip install snscrape
import pandas as pd
import snscrape.modules.twitter as sntwitter
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import string
import re
import textblob
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS
from wordcloud import ImageColorGenerator
import warnings
%matplotlib inline
import os
# Using OS library to call CLI commands in Python
os.system("snscrape --jsonl --max-results 10000 --since 2023-03-13 twitter-search 'chatGPT4' > text-chatGPT4-tweets.json")
import pandas as pd
# creates a pandas dataframe
tweets_df_chatGPT4 = pd.read_json('text-chatGPT4-tweets.json', lines=True)
tweets_df_chatGPT4.head()
**2. Data Loading**
df_chatGPT4 = tweets_df_chatGPT4[['date', 'rawContent','renderedContent','user','replyCount','retweetCount','likeCount','lang','place','hashtags','viewCount']].copy()
print(df_chatGPT4.shape)
**3. Twitter Data Cleaning, Preprocessing and Exploratory Data Analysis**
df2=df_chatGPT4.drop_duplicates('renderedContent')
print(df2.shape)
df2.head()
df2.info()
df2.date.value_counts()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#Heat Map for missing values
plt.figure(figsize=(17, 5))
sns.heatmap(df2.isnull(), cbar=True, yticklabels=False)
plt.xlabel("Column_Name", size=14, weight="bold")
plt.title("Places of missing values in column",fontweight="bold",size=17)
plt.show()
import plotly.graph_objects as go
Top_Location_Of_tweet = df2['place'].astype(str).value_counts().head(10)  # place entries are dicts, so count their string form
print(Top_Location_Of_tweet)
import nltk
nltk.download('stopwords')  # Download the stopword corpus
**Twitter Data Cleaning and Preprocessing**
def pre_process(text):
    # Remove links
    text = re.sub(r'http\S+', '', text)
    # Convert common HTML entities
    text = re.sub('&amp;', 'and', text)
    text = re.sub('&lt;', '<', text)
    text = re.sub('&gt;', '>', text)
    # Remove new line characters
    text = re.sub(r'[\r\n]+', ' ', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Collapse multiple whitespace characters
    text = re.sub(r'\s+', ' ', text)
    # Convert to lowercase
    text = text.lower()
    return text
df2['processed_text'] = df2['renderedContent'].apply(pre_process)
print(df2['processed_text'].head())
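A quick sanity check of pre_process on a made-up example (the sample string below is invented for illustration, not taken from the scraped data):
sample = "Check this https://t.co/abc123 @someone #ChatGPT4 is wild &amp; fun\n\nreally"
print(pre_process(sample))
# expected output: "check this is wild and fun really"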
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_bigram(df2['processed_text'], 20)
print(common_words)
df3 = pd.DataFrame(common_words, columns=['TweetText', 'count'])
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
#df3.groupby('TweetText').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count',linecolor='black',title='Top 20 bigrams in Tweet before removing spams')
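The cufflinks iplot call above is left commented out; a plain matplotlib version (a sketch, with figure size and orientation chosen by us) shows the same top bigrams:
plt.figure(figsize=(12, 6))
plt.barh(df3['TweetText'], df3['count'])
plt.gca().invert_yaxis()  # most frequent bigram at the top
plt.title('Top 20 bigrams in tweets')
plt.xlabel('Count')
plt.show()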
**4. Sentiment Analysis**
df3.head()
df2.head()
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')  # The VADER lexicon must be downloaded before the analyzer can be used
# Instantiate a new SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
# Generate sentiment scores
sentiment_scores = df2['processed_text'].apply(sid.polarity_scores)
df2.head()
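The VADER scores above are produced as a Series of dictionaries but never attached to df2; a small sketch (the column names vader_compound and vader_sentiment, and the ±0.05 thresholds, are our own choices) keeps the compound score next to each tweet:
df2['vader_compound'] = sentiment_scores.apply(lambda s: s['compound'])
df2['vader_sentiment'] = df2['vader_compound'].apply(
    lambda c: 'positive' if c > 0.05 else ('negative' if c < -0.05 else 'neutral'))
df2[['processed_text', 'vader_compound', 'vader_sentiment']].head()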
%matplotlib inline
plt.close('all')
df2['hashtags'].explode().value_counts()  # hashtag entries are lists (unhashable), so explode to count individual tags
import matplotlib.pyplot as plt
hashtags_counts = df2['hashtags'].explode().value_counts().head(5)
plt.figure(figsize=(10,6))
plt.pie(hashtags_counts.values, labels=hashtags_counts.index, autopct='%1.1f%%')
plt.title('Hashtags Counts')
plt.show()
#df2['lang'].value_counts()
import matplotlib.pyplot as plt
lang_counts = df2['lang'].value_counts().head(10)
plt.figure(figsize=(10,6))
plt.bar(lang_counts.index, lang_counts.values)
plt.title('Language Counts')
plt.xlabel('Language')
plt.ylabel('Count')
plt.show()
df2['country'] = df2['place'].apply(lambda x: x['country'] if isinstance(x, dict) else None)
df2['countryCode'] = df2['place'].apply(lambda x: x['countryCode'] if isinstance(x, dict) else None)
top_location_of_tweet = df2['country'].value_counts()
top_location_of_tweet.head(20)
import matplotlib.pyplot as plt
country_counts = df2['country'].value_counts().head(10)
plt.figure(figsize=(10,6))
plt.bar(country_counts.index, country_counts.values)
plt.title('Country Counts')
plt.xlabel('Country')
plt.ylabel('Count')
plt.show()
df2['date_column'] = df2['date'].dt.date
#Number of Tweets per day
# group the data by day and count the number of tweets per day
daily_counts = df2.groupby(df2['date'].dt.date).size()
# create a line chart of the daily tweet counts
plt.figure(figsize=(10,6))
plt.plot(daily_counts.index, daily_counts.values)
plt.title('Daily Tweet Counts')
plt.xlabel('Date')
plt.ylabel('Number of Tweets')
plt.show()
# Create a scatter plot of retweet count vs. like count
plt.scatter(df2['retweetCount'], df2['likeCount'])
plt.title('Retweet Count vs. Like Count')
plt.xlabel('Retweet Count')
plt.ylabel('Like Count')
plt.show()
!pip install textblob
import pandas as pd
from textblob import TextBlob
# Define a function to perform sentiment analysis on a tweet using TextBlob
def analyze_sentiment(tweet):
    # Create a TextBlob object for the tweet
    blob = TextBlob(tweet)
    # Use TextBlob to calculate the sentiment polarity of the tweet
    polarity = blob.sentiment.polarity
    # Return the sentiment polarity
    return polarity
# Apply the sentiment analysis function to each tweet in the DataFrame
df2['sentiment'] = df2['processed_text'].apply(analyze_sentiment)
# Print the resulting DataFrame
print(df2.head())
def classify_sentiment(polarity):
    if polarity > 0:
        return 'positive'
    elif polarity < 0:
        return 'negative'
    else:
        return 'neutral'
# Apply the classify_sentiment function to each sentiment polarity value in the DataFrame
df2['sentiment_type'] = df2['sentiment'].apply(classify_sentiment)
sentiment_counts = df2['sentiment_type'].value_counts()
# Create a bar plot of the sentiment counts
plt.bar(sentiment_counts.index, sentiment_counts.values)
# Add a title and labels for the x and y axes
plt.title('Sentiment Analysis')
plt.xlabel('Sentiment Class')
plt.ylabel('Number of Tweets')
# Show the plot
plt.show()
**Create Word Cloud**
very_positive_tweets = df2[df2['sentiment'] > 0.3]
# display only the tweet and sentiment score columns
very_positive_tweets[['processed_text', 'sentiment']].head()
very_negative_tweets = df2[df2['sentiment'] < 0]
very_negative_tweets.head()
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
# concatenate all the very positive tweets into a single string
all_tweets = ' '.join(very_positive_tweets['processed_text'])
# generate the word cloud
wordcloud = WordCloud(width=800, height=800, background_color='white', colormap='Blues').generate(all_tweets)
# plot the word cloud
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
import nltk
from nltk.corpus import stopwords
import string
from collections import Counter
import nltk
nltk.download('punkt')
all_words = nltk.word_tokenize(all_tweets.lower())
all_words = [word for word in all_words if word not in stopwords.words('english')]
all_words = [word for word in all_words if word not in string.punctuation]
# count the frequency of each word using a dictionary
word_freq = Counter(all_words)
# sort the words by frequency in descending order
sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
# display the top 20 words with their frequencies
top_words = sorted_words[:20]
for word, freq in top_words:
    print(f'{word}: {freq}')
# concatenate all the very negative tweets into a single string
all_tweets = ' '.join(very_negative_tweets['processed_text'])
# generate the word cloud
wordcloud = WordCloud(width=800, height=800, background_color='white', colormap='Blues').generate(all_tweets)
# plot the word cloud
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
**Sentiment Classification with Naive Bayes**
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
df3 = df2[['processed_text', 'sentiment_type']]
X_train, X_test, y_train, y_test = train_test_split(df3['processed_text'], df3['sentiment_type'], random_state=0)
# Convert text into numerical vectors using CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
# Train a Multinomial Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train_vec, y_train)
# Predict on the test set
y_pred = clf.predict(X_test_vec)
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
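Accuracy alone can be misleading when the three sentiment classes are imbalanced; a short sketch with scikit-learn's per-class report and confusion matrix:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))  # precision, recall and F1 per sentiment class
print(confusion_matrix(y_test, y_pred))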