SMA Experiments
EXP - 3
Aim - Social Media Network Analytics Basics (Closeness Centrality, Degree Centrality)
code -
import networkx as nx # Import networkx at the beginning of the script
import matplotlib.pyplot as plt
G=nx.Graph()
a, b, c = 'a', 'b', 'c' # Define a, b, and c
G.add_nodes_from([a,b,c])
G.nodes()
G.add_edge(a,b)
G.add_edge(a,c)
G.edges()
nx.draw(G, with_labels=True) # Add with_labels=True to display node labels
plt.show()
# New
!pip install networkx # Install networkx if it is not already available in the environment
import networkx as nx
#New
nx.degree(G)
#New
nx.degree_centrality(G)
#New
nx.shortest_path(G, source='a', target='c') # Shortest path between the existing nodes 'a' and 'c'
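The aim also lists closeness centrality; since it is derived from the same shortest-path lengths used above, a one-line sketch on the same graph G is enough:
nx.closeness_centrality(G) # Closeness of each node: inverse of its average shortest-path distance to all others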
#New
import matplotlib.pyplot as plt
L=nx.Graph()
a, b, c, d, e, f, g = 'a', 'b', 'c', 'd', 'e', 'f', 'g' # Define the seven node labels
L.add_nodes_from([a,b,c,d,e,f,g])
L.nodes()
L.add_edge(a,b)
L.add_edge(a,e)
L.add_edge(b,e)
L.add_edge(b,c)
L.add_edge(b,d)
L.add_edge(b,f)
L.add_edge(d,e)
L.add_edge(d,g)
L.add_edge(e,f)
L.add_edge(e,g)
L.edges()
nx.draw(L, with_labels=True) # Add with_labels=True to display node labels
plt.show()
# New
nx.betweenness_centrality(L)
# New
G=nx.gnp_random_graph(10, 0.5)
nx.draw(G, with_labels=True)
plt.show()
# New
nx.clustering(G)
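nx.clustering returns a per-node coefficient; if a single graph-level summary is wanted, a one-line sketch:
nx.average_clustering(G) # Mean clustering coefficient over all nodes of the random graph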
EXP - 4
Aim - Scrape YouTube comments with the YouTube Data API and perform sentiment analysis
import requests
video_id = "bw7bVpI5VcM"
api_key = "YOUR_API_KEY"  # Replace with your own YouTube Data API v3 key
# Retrieve video information
video_info_url = f"https://www.googleapis.com/youtube/v3/videos?part=snippet&id={video_id}&key={api_key}"
video_info_response = requests.get(video_info_url)
video_info_data = video_info_response.json()
video_info_data
# Retrieve video comments
comments_url = f"https://www.googleapis.com/youtube/v3/commentThreads?part=snippet&videoId={video_id}&key={api_key}"
comments_response= requests.get(comments_url)
comments_data = comments_response.json()
comments_data
# Extract the comment text
comments = [item["snippet"]["topLevelComment"]["snippet"]["textOriginal"] for item in comments_data["items"]]
print(comments)
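The request above returns only the first page of comment threads; a sketch (the helper fetch_all_comments and its max_pages limit are our own additions) that follows nextPageToken to collect more:
def fetch_all_comments(video_id, api_key, max_pages=5):
    all_comments, page_token = [], None
    for _ in range(max_pages):
        url = (
            "https://www.googleapis.com/youtube/v3/commentThreads"
            f"?part=snippet&videoId={video_id}&key={api_key}&maxResults=100"
        )
        if page_token:
            url += f"&pageToken={page_token}"
        data = requests.get(url).json()
        all_comments += [
            item["snippet"]["topLevelComment"]["snippet"]["textOriginal"]
            for item in data.get("items", [])
        ]
        page_token = data.get("nextPageToken")
        if not page_token:  # no more pages
            break
    return all_comments
# comments = fetch_all_comments(video_id, api_key)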
from textblob import TextBlob
def get_comment_sentiment(comment):
    analysis = TextBlob(comment)
    if analysis.sentiment.polarity > 0:
        return "Positive"
    elif analysis.sentiment.polarity == 0:
        return "Neutral"
    else:
        return "Negative"
comment_list = []
sentiment_list = []
for comment in comments:
    sentiment = get_comment_sentiment(comment)
    comment_list.append(comment)
    sentiment_list.append(sentiment)
    print(f"{comment} : {sentiment}")
import pandas as pd
sentiment_df = pd.DataFrame({"Comments": comment_list,"Sentiment": sentiment_list})
sentiment_df.head()
sentiment_df.to_csv("YouTube_Comments_Sentiment.csv", index=False)  # Save without the row index
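As a quick look at the overall distribution (a small sketch; the plot labels are our own choices), the sentiment counts can be plotted straight from the DataFrame:
import matplotlib.pyplot as plt
sentiment_counts = sentiment_df["Sentiment"].value_counts()
plt.figure(figsize=(6, 4))
plt.bar(sentiment_counts.index, sentiment_counts.values)
plt.title("YouTube Comment Sentiment")
plt.xlabel("Sentiment")
plt.ylabel("Number of Comments")
plt.show()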
!pip install boilerpipe3
from boilerpipe.extract import Extractor
URL="https://www.amazon.in/s?k=grass+mat+for+baby&crid=3FKEPQH8TQDJ1&sprefix=grass+mat+for+baby%2Caps%2C211&ref=nb_sb_noss_1"
extractor = Extractor(extractor='ArticleExtractor', url=URL)
print(extractor.getText())
!pip install feedparser
import feedparser
FEED_URL="http://feeds.feedburner.com/oreilly/radar/atom"
fp=feedparser.parse(FEED_URL)
for e in fp.entries:
    print(e.title)
    print(e.links[0].href)
    print(e.content[0].value)
EXP - 5
Aim - Twitter Sentiment Analysis
**1. Scrape Twitter Data for chatGPT4 Tweets**
!pip install snscrape
import pandas as pd
import snscrape.modules.twitter as sntwitter
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import string
import re
import textblob
from textblob import TextBlob
from wordcloud import WordCloud, STOPWORDS
from wordcloud import ImageColorGenerator
import warnings
%matplotlib inline
import os
# Using OS library to call CLI commands in Python
os.system("snscrape --jsonl --max-results 10000 --since 2023-03-13 twitter-search 'chatGPT4' > text-chatGPT4-tweets.json")
import pandas as pd
# creates a pandas dataframe
tweets_df_chatGPT4 = pd.read_json('text-chatGPT4-tweets.json', lines=True)
tweets_df_chatGPT4.head()
**2. Data Loading**
df_chatGPT4 = tweets_df_chatGPT4[['date', 'rawContent','renderedContent','user','replyCount','retweetCount','likeCount','lang','place','hashtags','viewCount']].copy()
print(df_chatGPT4.shape)
**3. Twitter Data Cleaning, Preprocessing and Exploratory Data Analysis**
df2=df_chatGPT4.drop_duplicates('renderedContent')
print(df2.shape)
df2.head()
df2.info()
df2.date.value_counts()
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
#Heat Map for missing values
plt.figure(figsize=(17, 5))
sns.heatmap(df2.isnull(), cbar=True, yticklabels=False)
plt.xlabel("Column_Name", size=14, weight="bold")
plt.title("Places of missing values in column",fontweight="bold",size=17)
plt.show()
import plotly.graph_objects as go
Top_Location_Of_tweet = df2['place'].astype(str).value_counts().head(10)  # place entries are dicts, so count their string form
print(Top_Location_Of_tweet)
import nltk
nltk.download('stopwords')  # Download the stopword corpus
**Twitter Data Cleaning and Preprocessing**
def pre_process(text):
    # Remove links
    text = re.sub(r'http\S+', '', text)
    # Convert common HTML entities
    text = re.sub('&amp;', 'and', text)
    text = re.sub('&lt;', '<', text)
    text = re.sub('&gt;', '>', text)
    # Remove new line characters
    text = re.sub(r'[\r\n]+', ' ', text)
    # Remove mentions
    text = re.sub(r'@\w+', '', text)
    # Remove hashtags
    text = re.sub(r'#\w+', '', text)
    # Collapse multiple whitespace characters
    text = re.sub(r'\s+', ' ', text)
    # Convert to lowercase
    text = text.lower()
    return text
df2['processed_text'] = df2['renderedContent'].apply(pre_process)
print(df2['processed_text'].head())
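A quick sanity check of pre_process on a made-up example (the sample string below is invented for illustration, not taken from the scraped data):
sample = "Check this https://t.co/abc123 @someone #ChatGPT4 is wild &amp; fun\n\nreally"
print(pre_process(sample))
# expected output: "check this is wild and fun really"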
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2), stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key=lambda x: x[1], reverse=True)
    return words_freq[:n]
common_words = get_top_n_bigram(df2['processed_text'], 20)
print(common_words)
df3 = pd.DataFrame(common_words, columns=['TweetText', 'count'])
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
#df3.groupby('TweetText').sum()['count'].sort_values(ascending=False).iplot(kind='bar', yTitle='Count',linecolor='black',title='Top 20 bigrams in Tweet before removing spams')
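The cufflinks iplot call above is left commented out; a plain matplotlib version (a sketch, with figure size and orientation chosen by us) shows the same top bigrams:
plt.figure(figsize=(12, 6))
plt.barh(df3['TweetText'], df3['count'])
plt.gca().invert_yaxis()  # most frequent bigram at the top
plt.title('Top 20 bigrams in tweets')
plt.xlabel('Count')
plt.show()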
**4. Sentiment Analysis**
df3.head()
df2.head()
from nltk.sentiment.vader import SentimentIntensityAnalyzer
nltk.download('vader_lexicon')  # The VADER lexicon must be downloaded before the analyzer can be used
# Instantiate a new SentimentIntensityAnalyzer
sid = SentimentIntensityAnalyzer()
# Generate sentiment scores
sentiment_scores = df2['processed_text'].apply(sid.polarity_scores)
df2.head()
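The VADER scores above are produced as a Series of dictionaries but never attached to df2; a small sketch (the column names vader_compound and vader_sentiment, and the ±0.05 thresholds, are our own choices) keeps the compound score next to each tweet:
df2['vader_compound'] = sentiment_scores.apply(lambda s: s['compound'])
df2['vader_sentiment'] = df2['vader_compound'].apply(
    lambda c: 'positive' if c > 0.05 else ('negative' if c < -0.05 else 'neutral'))
df2[['processed_text', 'vader_compound', 'vader_sentiment']].head()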
%matplotlib inline
plt.close('all')
df2['hashtags'].explode().value_counts()  # hashtag entries are lists (unhashable), so explode to count individual tags
import matplotlib.pyplot as plt
hashtags_counts = df2['hashtags'].explode().value_counts().head(5)
plt.figure(figsize=(10,6))
plt.pie(hashtags_counts.values, labels=hashtags_counts.index, autopct='%1.1f%%')
plt.title('Hashtags Counts')
plt.show()
#df2['lang'].value_counts()
import matplotlib.pyplot as plt
lang_counts = df2['lang'].value_counts().head(10)
plt.figure(figsize=(10,6))
plt.bar(lang_counts.index, lang_counts.values)
plt.title('Language Counts')
plt.xlabel('Language')
plt.ylabel('Count')
plt.show()
df2['country'] = df2['place'].apply(lambda x: x['country'] if isinstance(x, dict) else None)
df2['countryCode'] = df2['place'].apply(lambda x: x['countryCode'] if isinstance(x, dict) else None)
top_location_of_tweet = df2['country'].value_counts()
top_location_of_tweet.head(20)
import matplotlib.pyplot as plt
country_counts = df2['country'].value_counts().head(10)
plt.figure(figsize=(10,6))
plt.bar(country_counts.index, country_counts.values)
plt.title('Country Counts')
plt.xlabel('Country')
plt.ylabel('Count')
plt.show()
df2['date_column'] = df2['date'].dt.date
#Number of Tweets per day
# group the data by day and count the number of tweets per day
daily_counts = df2.groupby(df2['date'].dt.date).size()
# create a line chart of the daily tweet counts
plt.figure(figsize=(10,6))
plt.plot(daily_counts.index, daily_counts.values)
plt.title('Daily Tweet Counts')
plt.xlabel('Date')
plt.ylabel('Number of Tweets')
plt.show()
# Create a scatter plot of retweet count vs. like count
plt.scatter(df2['retweetCount'], df2['likeCount'])
plt.title('Retweet Count vs. Like Count')
plt.xlabel('Retweet Count')
plt.ylabel('Like Count')
plt.show()
!pip install textblob
import pandas as pd
from textblob import TextBlob
# Define a function to perform sentiment analysis on a tweet using TextBlob
def analyze_sentiment(tweet):
    # Create a TextBlob object for the tweet
    blob = TextBlob(tweet)
    # Use TextBlob to calculate the sentiment polarity of the tweet
    polarity = blob.sentiment.polarity
    # Return the sentiment polarity
    return polarity
# Apply the sentiment analysis function to each tweet in the DataFrame
df2['sentiment'] = df2['processed_text'].apply(analyze_sentiment)
# Print the resulting DataFrame
print(df2.head())
def classify_sentiment(polarity):
    if polarity > 0:
        return 'positive'
    elif polarity < 0:
        return 'negative'
    else:
        return 'neutral'
# Apply the classify_sentiment function to each sentiment polarity value in the DataFrame
df2['sentiment_type'] = df2['sentiment'].apply(classify_sentiment)
sentiment_counts = df2['sentiment_type'].value_counts()
# Create a bar plot of the sentiment counts
plt.bar(sentiment_counts.index, sentiment_counts.values)
# Add a title and labels for the x and y axes
plt.title('Sentiment Analysis')
plt.xlabel('Sentiment Class')
plt.ylabel('Number of Tweets')
# Show the plot
plt.show()
**Create Word Cloud**
very_positive_tweets = df2[df2['sentiment'] > 0.3]
# display only the tweet and sentiment score columns
very_positive_tweets[['processed_text', 'sentiment']].head()
very_negative_tweets = df2[df2['sentiment'] < 0]
very_negative_tweets.head()
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
# concatenate all the very positive tweets into a single string
all_tweets = ' '.join(very_positive_tweets['processed_text'])
# generate the word cloud
wordcloud = WordCloud(width=800, height=800, background_color='white', colormap='Blues').generate(all_tweets)
# plot the word cloud
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
import nltk
from nltk.corpus import stopwords
import string
from collections import Counter
import nltk
nltk.download('punkt')
all_words = nltk.word_tokenize(all_tweets.lower())
all_words = [word for word in all_words if word not in stopwords.words('english')]
all_words = [word for word in all_words if word not in string.punctuation]
# count the frequency of each word using a dictionary
word_freq = Counter(all_words)
# sort the words by frequency in descending order
sorted_words = sorted(word_freq.items(), key=lambda x: x[1], reverse=True)
# display the top 20 words with their frequencies
top_words = sorted_words[:20]
for word, freq in top_words:
    print(f'{word}: {freq}')
# concatenate all the very negative tweets into a single string
all_tweets = ' '.join(very_negative_tweets['processed_text'])
# generate the word cloud
wordcloud = WordCloud(width=800, height=800, background_color='white', colormap='Blues').generate(all_tweets)
# plot the word cloud
plt.figure(figsize=(8, 8), facecolor=None)
plt.imshow(wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()
**Sentiment Classification with Naive Bayes**
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
df3 = df2[['processed_text', 'sentiment_type']]
X_train, X_test, y_train, y_test = train_test_split(df3['processed_text'], df3['sentiment_type'], random_state=0)
# Convert text into numerical vectors using CountVectorizer
vectorizer = CountVectorizer(stop_words='english')
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)
# Train a Multinomial Naive Bayes classifier
clf = MultinomialNB()
clf.fit(X_train_vec, y_train)
# Predict on the test set
y_pred = clf.predict(X_test_vec)
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)
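Accuracy alone can be misleading when the three sentiment classes are imbalanced; a short sketch with scikit-learn's per-class report and confusion matrix:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_test, y_pred))  # precision, recall and F1 per sentiment class
print(confusion_matrix(y_test, y_pred))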