SMA EXPERIMENTS

EXP - 3

Aim - Social Media Network Analytics Basics (Closeness Centrality, Degree Centrality)
Code -
import networkx as nx # Import networkx at the beginning of the script
import matplotlib.pyplot as plt
G=nx.Graph()
a, b, c = 'a', 'b', 'c'  # Define a, b, and c
G.add_nodes_from([a,b,c])
G.nodes()
G.add_edge(a,b)
G.add_edge(a,c)
G.edges()
nx.draw(G, with_labels=True) # Add with_labels=True to display node labels
plt.show()

# New
!pip install networkx # Install the networkx library (a no-op if it is already installed)
import networkx as nx # Re-import networkx after installation

#New
nx.degree(G)

#New
nx.degree_centrality(G)

#New
nx.shortest_path(G, source='a', target='c')  # Specify a function within the module and existing nodes
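The aim also lists closeness centrality; a minimal added sketch using networkx's built-in nx.closeness_centrality on the same graph G:

# New
nx.closeness_centrality(G)  # reciprocal of each node's average shortest-path distance to the others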

#New
import matplotlib.pyplot as plt
L=nx.Graph()
a, b, c, d, e, f, g = 'a', 'b', 'c', 'd', 'e', 'f', 'g' # Define node labels a through g
L.add_nodes_from([a,b,c,d,e,f,g])
L.nodes()
L.add_edge(a,b)
L.add_edge(a,e)
L.add_edge(b,e)
L.add_edge(b,c)
L.add_edge(b,d)
L.add_edge(b,f)
L.add_edge(d,e)
L.add_edge(d,g)
L.add_edge(e,f)
L.add_edge(e,g)

L.edges()
nx.draw(L, with_labels=True) # Add with_labels=True to display node labels
plt.show()

# New
nx.betweenness_centrality(L)

# New
G=nx.gnp_random_graph(10, 0.5)
nx.draw(G, with_labels=True)
plt.show()

# New
nx.clustering(G)
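As a small added sketch, the per-node clustering coefficients above can be averaged over the whole random graph with networkx's built-in nx.average_clustering:

# New
nx.average_clustering(G)  # mean of the per-node clustering coefficients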


EXP - 4
Aim - Scrape YouTube comments with the YouTube API and do sentiment analysis

import requests

video_id = "bw7bVpI5VcM"

api_key = "AIzaSyC6ZHngVmQRc6VdyF4cY0F-wL68X2uzZvk"  # YouTube Data API key (replace with your own)

# Retrieve video information
video_info_url = f"https://www.googleapis.com/youtube/v3/videos?part=snippet&id={video_id}&key={api_key}"
video_info_response = requests.get(video_info_url)
video_info_data = video_info_response.json()

video_info_data

# Retrieve video comments
comments_url = f"https://www.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId={video_id}&key={api_key}"
comments_response= requests.get(comments_url)
comments_data = comments_response.json()

comments_data

# Extract the comment text
comments = [item["snippet"]["topLevelComment"]["snippet"]["textOriginal"] for item in comments_data["items"]]

print(comments)
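Note that commentThreads.list returns only one page of results per request (the API caps maxResults at 100 and returns a nextPageToken when more comments exist). A minimal sketch, assuming the same video_id and api_key as above, of paging through all top-level comments:

# Page through all top-level comments using nextPageToken
all_comments = []
page_token = None
while True:
    url = (f"https://www.googleapis.com/youtube/v3/commentThreads"
           f"?part=snippet&videoId={video_id}&maxResults=100&key={api_key}")
    if page_token:
        url += f"&pageToken={page_token}"
    data = requests.get(url).json()
    all_comments += [item["snippet"]["topLevelComment"]["snippet"]["textOriginal"]
                     for item in data.get("items", [])]
    page_token = data.get("nextPageToken")
    if not page_token:
        break

print(len(all_comments))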

from textblob import TextBlob


def get_comment_sentiment(comment):
  analysis = TextBlob(comment)
  if analysis.sentiment.polarity > 0:
    return "Positive"
  elif analysis.sentiment.polarity == 0:
    return "Neutral"
  else:
    return "Negative"

comment_list = []
sentiment_list = []
for comment in comments:
  sentiment = get_comment_sentiment(comment)
  comment_list.append(comment)
  sentiment_list.append(sentiment)
  print(f"{comment} : {sentiment}")

import pandas as pd

sentiment_df = pd.DataFrame({"Comments": comment_list,"Sentiment": sentiment_list})

sentiment_df.head()

sentiment_df.to_csv("YouTube_Comments_Sentiment.csv")
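A quick added sketch summarising how the scraped comments split across sentiment classes, using the sentiment_df built above:

import matplotlib.pyplot as plt

print(sentiment_df["Sentiment"].value_counts())
sentiment_df["Sentiment"].value_counts().plot(kind="bar", title="YouTube comment sentiment")
plt.show()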

!pip install boilerpipe3


from boilerpipe.extract import Extractor
URL="https://www.amazon.in/s?k=grass+mat+for+baby&crid=3FKEPQH8TQDJ1&sprefix=grass+mat+for+baby%2Caps%2C211&ref=nb_sb_noss_1"
extractor = Extractor(extractor='ArticleExtractor', url=URL)
print(extractor.getText())

!pip install feedparser

import feedparser

FEED_URL="http://feeds.feedburner.com/oreilly/radar/atom"
fp=feedparser.parse(FEED_URL)
for e in fp.entries:
  print(e.title)
  print(e.links[0].href)
  print(e.content[0].value)



EXP - 5
Aim - Twitter Sentiment Analysis

**1. Scrape Twitter Data for chatGPT4 Tweets**

!pip install snscrape

import pandas as pd
import snscrape.modules.twitter as sntwitter
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

import string
import re
import textblob
from textblob import TextBlob

from wordcloud import WordCloud, STOPWORDS

from wordcloud import ImageColorGenerator

import warnings
%matplotlib inline

import os

# Using OS library to call CLI commands in Python
os.system("snscrape --jsonl --max-results 10000 --since 2023-03-13 twitter-search 'chatGPT4' > text-chatGPT4-tweets.json")

import pandas as pd

# creates a pandas dataframe
tweets_df_chatGPT4 = pd.read_json('text-chatGPT4-tweets.json', lines=True)


tweets_df_chatGPT4.head()


**2. Data Loading**

df_chatGPT4 = tweets_df_chatGPT4[['date', 'rawContent','renderedContent','user','replyCount','retweetCount','likeCount','lang','place','hashtags','viewCount']].copy()

print(df_chatGPT4.shape)

**3. Twitter Data Cleaning , Preprocessing and Exploratory Data Analysis**

df2=df_chatGPT4.drop_duplicates('renderedContent')

print(df2.shape)

df2.head()

df2.info()

df2.date.value_counts()

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

#Heat Map for missing values
plt.figure(figsize=(17, 5))
sns.heatmap(df2.isnull(), cbar=True, yticklabels=False)
plt.xlabel("Column_Name", size=14, weight="bold")
plt.title("Places of missing values in column",fontweight="bold",size=17)
plt.show()

import plotly.graph_objects as go
Top_Location_Of_tweet= df2['place'].value_counts().head(10)

print (Top_Location_Of_tweet)
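Since plotly.graph_objects is already imported, a minimal sketch of charting these top tweet locations (the place values are converted to strings only for labelling):

fig = go.Figure(go.Bar(x=Top_Location_Of_tweet.index.astype(str),
                       y=Top_Location_Of_tweet.values))
fig.update_layout(title="Top 10 tweet locations", xaxis_title="Place", yaxis_title="Tweet count")
fig.show()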

import nltk

nltk.download('stopwords')
stop = stopwords.words('english')  # English stop word list

**Twitter Data Cleaning and Preprocessing**

def pre_process(text):
    # Remove links
    text = re.sub(r'http\S+', '', text)

    # Convert HTML character references
    text = re.sub('&amp;?', 'and', text)
    text = re.sub('&lt;?', '<', text)
    text = re.sub('&gt;?', '>', text)

    # Remove new line characters
    text = re.sub(r'[\r\n]+', ' ', text)

    # Remove mentions
    text = re.sub(r'@\w+', '', text)

    # Remove hashtags
    text = re.sub(r'#\w+', '', text)

    # Collapse multiple space characters
    text = re.sub(r'\s+', ' ', text)

    # Convert to lowercase
    text = text.lower()
    return text

df2['processed_text'] = df2['renderedContent'].apply(pre_process)

print(df2['processed_text'].head())

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

common_words = get_top_n_bigram(df2['processed_text'], 20)

print(common_words)

df3 = pd.DataFrame(common_words, columns = ['TweetText' , 'count'])

import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)

#df3.groupby('TweetText').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count',linecolor='black',title='Top 20 bigrams in Tweet before removing spams')
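Because the cufflinks iplot call above is commented out, here is a plain matplotlib sketch of the same top-20 bigram chart built from df3:

plt.figure(figsize=(12, 5))
plt.bar(df3['TweetText'], df3['count'])
plt.xticks(rotation=75)
plt.title('Top 20 bigrams in tweets')
plt.ylabel('Count')
plt.tight_layout()
plt.show()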



**4. Sentiment Analysis**

df3.head()

df2.head()

from nltk.sentiment.vader import SentimentIntensityAnalyzer

nltk.download('vader_lexicon')  # Lexicon required by the VADER analyzer

# Instantiate a new SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()

# Generate sentiment scores
sentiment_scores = df2['processed_text'].apply(sid.polarity_scores)
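The VADER scores above are computed but not stored; a minimal added sketch keeping the compound score and a coarse label as new (hypothetical) columns vader_compound and vader_label, using the conventional ±0.05 thresholds:

# Keep the VADER compound score (-1 to 1) and a simple label as columns
df2['vader_compound'] = sentiment_scores.apply(lambda s: s['compound'])
df2['vader_label'] = df2['vader_compound'].apply(
    lambda c: 'positive' if c > 0.05 else ('negative' if c < -0.05 else 'neutral'))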

df2.head()

%matplotlib inline
plt.close('all')

df2['hashtags'].value_counts()



import matplotlib.pyplot as plt

hashtags_counts = df2['hashtags'].value_counts().head(5)
plt.figure(figsize=(10,6))
plt.pie(hashtags_counts.values, labels=hashtags_counts.index, autopct='%1.1f%%')
plt.title('Hashtags Counts')
plt.show()

#df2['lang'].value_counts()

import matplotlib.pyplot as plt

lang_counts = df2['lang'].value_counts().head(10)
plt.figure(figsize=(10,6))
plt.bar(lang_counts.index, lang_counts.values)
plt.title('Language Counts')
plt.xlabel('Language')
plt.ylabel('Count')
plt.show()


df2['country'] = df2['place'].apply(lambda x: x['country'] if isinstance(x, dict) else None)
df2['countryCode'] = df2['place'].apply(lambda x: x['countryCode'] if isinstance(x, dict) else None)

top_location_of_tweet = df2['country'].value_counts()
top_location_of_tweet.head(20)


import matplotlib.pyplot as plt

country_counts = df2['country'].value_counts().head(10)
plt.figure(figsize=(10,6))
plt.bar(country_counts.index, country_counts.values)
plt.title('Country Counts')
plt.xlabel('Country')
plt.ylabel('Count')
plt.show()

df2['date_column'] = df2['date'].dt.date

#Number of Tweets per day

# group the data by day and count the number of tweets per day
daily_counts = df2.groupby(df2['date'].dt.date).size()  # one count per day (a Series, so it plots as a single line)

# create a line chart of the daily tweet counts
plt.figure(figsize=(10,6))
plt.plot(daily_counts.index, daily_counts.values)
plt.title('Daily Tweet Counts')
plt.xlabel('Date')
plt.ylabel('Number of Tweets')
plt.show()


# Create a scatter plot of retweet count vs. like count
plt.scatter(df2['retweetCount'], df2['likeCount'])
plt.title('Retweet Count vs. Like Count')
plt.xlabel('Retweet Count')
plt.ylabel('Like Count')
plt.show()

!pip install textblob

import pandas as pd
from textblob import TextBlob

# Define a function to perform sentiment analysis on a tweet using TextBlob
def analyze_sentiment(tweet):
    # Create a TextBlob object for the tweet
    blob = TextBlob(tweet)
    
    # Use TextBlob to calculate the sentiment polarity of the tweet
    polarity = blob.sentiment.polarity
    
    # Return the sentiment polarity
    return polarity


# Apply the sentiment analysis function to each tweet in the DataFrame
df2['sentiment'] = df2['processed_text'].apply(analyze_sentiment)

# Print the resulting DataFrame
print(df2.head())



def classify_sentiment(polarity):
    if polarity > 0:
        return 'positive'
    elif polarity < 0:
        return 'negative'
    else:
        return 'neutral'

# Apply the classify_sentiment function to each sentiment polarity value in the DataFrame
df2['sentiment_type'] = df2['sentiment'].apply(classify_sentiment)
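As an optional added check, the sentiment labels can be cross-tabulated against tweet language using pandas.crosstab on columns already present in df2:

print(pd.crosstab(df2['lang'], df2['sentiment_type']).head(10))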


sentiment_counts = df2['sentiment_type'].value_counts()

# Create a bar plot of the sentiment counts
plt.bar(sentiment_counts.index, sentiment_counts.values)

# Add a title and labels for the x and y axes
plt.title('Sentiment Analysis')
plt.xlabel('Sentiment Class')
plt.ylabel('Number of Tweets')

# Show the plot
plt.show()

**Create Word Cloud**

very_positive_tweets = df2[df2['sentiment'] > 0.3]

# display only the tweet and sentiment score columns

very_positive_tweets[['processed_text', 'sentiment']].head()



very_negative_tweets = df2[df2['sentiment'] < 0]

very_negative_tweets.head()



from wordcloud import WordCloud, STOPWORDS
from PIL import Image

# concatenate all the very positive tweets into a single string
all_tweets = ' '.join(very_positive_tweets['processed_text'])

# generate the word cloud
wordcloud = WordCloud(width=800, height=800, background_color='white', colormap='Blues').generate(all_tweets)

# plot the word cloud
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

import nltk
from nltk.corpus import stopwords
import string
from collections import Counter

import nltk
nltk.download('punkt')

all_words = nltk.word_tokenize(all_tweets.lower())
all_words = [word for word in all_words if word not in stopwords.words('english')]
all_words = [word for word in all_words if word not in string.punctuation]

# count the frequency of each word using a dictionary
word_freq = Counter(all_words)

# sort the words by frequency in descending order
sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)

# display the top 20 words with their frequencies
top_words = sorted_words[:20]
for word, freq in top_words:
    print(f'{word}: {freq}')
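nltk's FreqDist gives a quick plot of the same frequencies; a minimal sketch on the all_words list built above:

from nltk import FreqDist
FreqDist(all_words).plot(20)  # plot the 20 most frequent words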

# concatenate all the very negative tweets into a single string
all_tweets = ' '.join(very_negative_tweets['processed_text'])

# generate the word cloud
wordcloud = WordCloud(width=800, height=800, background_color='white', colormap='Blues').generate(all_tweets)

# plot the word cloud
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

**Sentiment Classification using Naive Bayes**

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score


df3 = df2[['processed_text', 'sentiment_type']]

X_train, X_test, y_train, y_test = train_test_split(df3['processed_text'], df3['sentiment_type'], random_state=0)


# Convert text into numerical vectors using CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Train a Multinomial Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train_vec, y_train)

# Predict on the test set
y_pred = clf.predict(X_test_vec)

# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
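Accuracy alone can hide class imbalance, so a per-class breakdown is worth adding; a minimal sketch using sklearn's classification_report on the same predictions:

from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))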
