First, create a Twitter application to obtain API credentials: http://docs.inboundnow.com/guide/create-twitter-application/
twitterauth <- function() {
  # install.packages("twitteR")
  require('twitteR')
  api_key <- "put api key here"
  api_secret <- "put api secret key here"
  access_token <- "put access token here"
  access_token_secret <- "put access secret here"
  setup_twitter_oauth(api_key, api_secret, access_token, access_token_secret)
}
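Call the helper once per session to authenticate (this triggers the prompt shown below):
twitterauth()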
Save the OAuth token by choosing 1 (Yes) at the caching prompt:
> setup_twitter_oauth(api_key,api_secret,access_token,access_token_secret)
[1] "Using direct authentication"
Use a local file ('.httr-oauth'), to cache OAuth access credentials between R sessions?
1: Yes
2: No
Selection: 1
Search the user timeline and save the data as a data frame:
tweets<-userTimeline(tweetuser,n=nooftweets)
tweets.df<-twListToDF(tweets)
Clean the tweets:
require('plyr')
require('stringr')
clean_tweet <- laply(tweets.df$text, function(tweet) {
  tweet <- gsub('https://', '', tweet)       # removes https://
  tweet <- gsub('http://', '', tweet)        # removes http://
  tweet <- gsub('[^[:graph:]]', ' ', tweet)  # replaces non-graphical characters
                                             # (like emoticons) with spaces
  tweet <- gsub('[[:punct:]]', '', tweet)    # removes punctuation
  tweet <- gsub('[[:cntrl:]]', '', tweet)    # removes control characters
  tweet <- gsub('\\d+', '', tweet)           # removes numbers
  tweet <- str_replace_all(tweet, "[^[:graph:]]", " ")
  tweet <- tolower(tweet)                    # makes all letters lowercase
})
tweets.df$text<-clean_tweet
Get the user profile information:
temp_df <- twListToDF(lookupUsers(tweetuser))
temp_df
Replicate the profile information once per tweet and bind the tweet dates and text to it in a new table:
a<-temp_df[rep(seq_len(nrow(temp_df)), each=length(tweets)),]
b<-cbind(a,tweets.df$created,tweets.df$text)
Rename the columns and save the result as a username.csv file:
names(b)[1] <- "user name"
names(b)[18] <- "tweets date"
names(b)[19] <- "tweet "
dim(b)
colnames(b)
filename<-tweetuser
Format<-"csv"
File<-paste(filename,Format,sep=".")
write.csv(b, file = File)
Create a wrapper function that retrieves a user's data for a given number of tweets:
Userdataret <- function(tweetuser, nooftweets) {
  setwd('C:\\Users\\guptakas\\Rscripts\\data\\user')
  require('dplyr')
  require('twitteR')
  require('stringr')
  require('plyr')
  #tweets<-searchTwitter(tweetuser,n=nooftweets)
  tweets <- userTimeline(tweetuser, n = nooftweets)
  tweets.df <- twListToDF(tweets)
  clean_tweet <- laply(tweets.df$text, function(tweet) {
    tweet <- gsub('https://', '', tweet)       # removes https://
    tweet <- gsub('http://', '', tweet)        # removes http://
    tweet <- gsub('[^[:graph:]]', ' ', tweet)  # replaces non-graphical characters
                                               # (like emoticons) with spaces
    tweet <- gsub('[[:punct:]]', '', tweet)    # removes punctuation
    tweet <- gsub('[[:cntrl:]]', '', tweet)    # removes control characters
    tweet <- gsub('\\d+', '', tweet)           # removes numbers
    tweet <- str_replace_all(tweet, "[^[:graph:]]", " ")
    tweet <- tolower(tweet)                    # makes all letters lowercase
  })
  tweets.df$text <- clean_tweet
  temp_df <- twListToDF(lookupUsers(tweetuser))
  a <- temp_df[rep(seq_len(nrow(temp_df)), each = length(tweets)), ]
  b <- cbind(a, tweets.df$created, tweets.df$text)
  names(b)[1] <- "user name"
  names(b)[18] <- "tweets date"
  names(b)[19] <- "tweet "
  filename <- tweetuser
  Format <- "csv"
  File <- paste(filename, Format, sep = ".")
  write.csv(b, file = File)
}
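For example (the tweet count here is illustrative; a call like this produces the narendramodi.csv used in the stemming section below):
Userdataret('narendramodi', 1000)  # writes narendramodi.csv to the working directory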
User <- getUser("pass the user name here")
User_Friends_IDs <- User$getFriends(n = nooffriends)  ## pass the number of friends you want to retrieve
length(User_Friends_IDs)  ## check that the friends were retrieved correctly
friends.df <- twListToDF(User_Friends_IDs)
for (x in friends.df$screenName) {  ## loop over each friend and collect their information
  Userdataret(x, no_of_tweets)
}
Corpus comes with built-in support for the algorithmic stemmers provided by the Snowball Stemming Library, which supports the following languages: arabic (ar), danish (da), german (de), english (en), spanish (es), finnish (fi), french (fr), hungarian (hu), italian (it), dutch (nl), norwegian (no), portuguese (pt), romanian (ro), russian (ru), swedish (sv), tamil (ta), and turkish (tr). You can select one of these stemmers using either the full name of the language or its two-letter code.
text <- "love loving lovingly loved lover lovely love"
text_tokens(text, stemmer = "en") # english stemmer
[[1]]
[1] "love" "love" "love" "love" "lover" "love" "love"
One common way to build a stemmer is from a list of (term, stem) pairs. Many such lists are available at lexiconista.com. Download the dictionary:
url <- "http://www.lexiconista.com/Datasets/lemmatization-en.zip"
tmp <- tempfile()
download.file(url, tmp)
Extract the dictionary into a data frame:
con <- unz(tmp, "lemmatization-en.txt", encoding = "UTF-8")
tab <- read.delim(con, header=FALSE, stringsAsFactors = FALSE)
names(tab) <- c("stem", "term")
head(tab)
Define the stemming function:
stem_list <- function(term) {
  i <- match(term, tab$term)
  if (is.na(i)) {
    stem <- term
  } else {
    stem <- tab$stem[[i]]
  }
  stem
}
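A quick sanity check (assuming these pairs are present in the downloaded dictionary, as the output further below suggests):
stem_list("gladdening")  # "gladden"
stem_list("qwerty")      # "qwerty" -- unknown terms pass through unchanged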
Apply it to a dataset. `laply` (from the plyr package) takes each element of text and applies the function to it:
dataset$tweet. <- laply(dataset$tweet., function(text) {
  x <- text_tokens(text, stemmer = stem_list)  # dictionary-based stemmer
  text <- paste(unlist(x), collapse = ' ')
})
stemop <- function(dataset) {
  require('plyr')
  require('corpus')
  # text stemming for live data
  url <- "http://www.lexiconista.com/Datasets/lemmatization-en.zip"
  tmp <- tempfile()
  download.file(url, tmp)
  # extract the contents
  con <- unz(tmp, "lemmatization-en.txt", encoding = "UTF-8")
  tab <- read.delim(con, header = FALSE, stringsAsFactors = FALSE)
  names(tab) <- c("stem", "term")
  stem_list <- function(term) {
    i <- match(term, tab$term)
    if (is.na(i)) {
      stem <- term
    } else {
      stem <- tab$stem[[i]]
    }
    stem
  }
  dataset$tweet. <- laply(dataset$tweet., function(text) {
    x <- text_tokens(text, stemmer = stem_list)  # dictionary-based stemmer
    text <- paste(unlist(x), collapse = ' ')
  })
  return(dataset)
}
Apply the stemming function to a dataset:
nm<-read.csv('narendramodi.csv')
stemdataset<-stemop(nm)
nm$tweet.[264]
stemdataset$tweet.[264]
Output:
> nm$tweet.[264]
[1] it is gladdening to witness a very healthy spirit of competition among the states to draw maximum investment this tcoqejbqfzcla
264 Levels: ...
> stemdataset$tweet.[264]
[1] "it be gladden to witness a very healthy spirit of competition among the state to draw maximum investment this tcoqejbqfzcla"
_gladdening is converted to its root, gladden_
`iconv` uses system facilities to convert a character vector between encodings; the 'i' stands for 'internationalization'. We use it to strip non-ASCII symbols before training.
### Loading and preprocessing a training set of tweets
# function for converting some symbols
conv_fun <- function(x) iconv(x, "latin1", "ASCII", "")
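A quick illustration of what `conv_fun` does (the input string here is made up):
conv_fun("caf\xe9 latte")  # "caf latte" -- non-ASCII characters are dropped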
Load the tweets and assign column names (the CSV has no header row):
tweets_classified <- read.csv('training.1600000.processed.noemoticon.csv',
                              header = FALSE, stringsAsFactors = FALSE)
dim(tweets_classified)
colnames(tweets_classified)<- c('sentiment', 'id', 'date', 'query', 'user', 'text')
head(tweets_classified$text)
> head(tweets_classified$text)
[1] "is upset that he can't update his Facebook by texting it... and might cry as a result School today also. Blah!"
[2] "@Kenichan I dived many times for the ball. Managed to save 50% The rest go out of bounds"
[3] "my whole body feels itchy and like its on fire "
[4] "@nationwideclass no, it's not behaving at all. i'm mad. why am i here? because I can't see you all over there. "
[5] "@Kwesidei not the whole crew "
> head(tweets_classified$sentiment)
[1] 0 0 0 0 0
Convert the sentiment classes to 0/1 and the tweet text from latin1 to ASCII:
tweets_classified <- tweets_classified %>%
  # converting some symbols
  dmap_at('text', conv_fun) %>%
  # replacing class values
  mutate(sentiment = ifelse(sentiment == 0, 0, 1))
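`dmap_at` comes from purrr (it moved to the purrrlyr package in later releases). If it is unavailable, a plain dplyr equivalent should give the same result (a sketch):
tweets_classified <- tweets_classified %>%
  mutate(text = conv_fun(text),
         sentiment = ifelse(sentiment == 0, 0, 1))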
Split the data: 80% to train and 20% to test.
# data splitting on train and test
library(caret)
set.seed(2340)
trainIndex <- createDataPartition(tweets_classified$sentiment, p = 0.8,
                                  list = FALSE,
                                  times = 1)
tweets_train <- tweets_classified[trainIndex, ]
tweets_test <- tweets_classified[-trainIndex, ]
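A quick check that the split is roughly 80/20:
nrow(tweets_train) / nrow(tweets_classified)  # ~0.8
nrow(tweets_test) / nrow(tweets_classified)   # ~0.2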
##### doc2vec #####
# define preprocessing function and tokenization function
_`word_tokenizer` splits the text into words and the `itoken` function divides the data into processable chunks_
library(text2vec)
prep_fun <- tolower
tok_fun <- word_tokenizer
it_train <- itoken(tweets_train$text,
                   preprocessor = prep_fun,
                   tokenizer = tok_fun,
                   ids = tweets_train$id,
                   progressbar = TRUE)
it_test <- itoken(tweets_test$text,
                  preprocessor = prep_fun,
                  tokenizer = tok_fun,
                  ids = tweets_test$id,
                  progressbar = TRUE)
> it_train
<itoken>
Inherits from: <iterator>
Public:
chunk_size: 128000
clone: function (deep = FALSE)
counter: 0
ids: 1467810672 1467810917 1467811184 1467811193 1467811592 1 ...
initialize: function (iterable, ids = NULL, n_chunks = 10, progress_ = interactive(),
is_complete: active binding
iterable: is upset that he can't update his Facebook by texting it ...
length: active binding
nextElem: function ()
preprocessor: list
progress: TRUE
progressbar: txtProgressBar
tokenizer: list
> it_test
<itoken>
Inherits from: <iterator>
Public:
chunk_size: 32000
clone: function (deep = FALSE)
counter: 0
ids: 1467811372 1467812784 1467814180 1467815988 1467817502 1 ...
initialize: function (iterable, ids = NULL, n_chunks = 10, progress_ = interactive(),
is_complete: active binding
iterable: @Kwesidei not the whole crew @smarrison i would've been ...
length: active binding
nextElem: function ()
preprocessor: list
progress: TRUE
progressbar: txtProgressBar
tokenizer: list
Create a vocabulary of words and a document-term matrix (DTM) for train and test from that vocabulary. Progress is shown in 10% increments because the iterator splits the data into 10 chunks.
# creating vocabulary and document-term matrix
vocab <- create_vocabulary(it_train)
vocab
vectorizer <- vocab_vectorizer(vocab)
vectorizer
dtm_train <- create_dtm(it_train, vectorizer)
dtm_test <- create_dtm(it_test, vectorizer)
> dtm_train <- create_dtm(it_train, vectorizer)
  |==============================================================================================================| 100%
> dtm_train
1280000 x 615245 sparse Matrix
The training DTM contains 615,245 unique words.
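With this many rare terms, text2vec users often prune the vocabulary before vectorizing. A hedged sketch (the thresholds are arbitrary, and the DTMs would then be rebuilt from `vectorizer_pruned`):
vocab_pruned <- prune_vocabulary(vocab, term_count_min = 10,
                                 doc_proportion_max = 0.5)
vectorizer_pruned <- vocab_vectorizer(vocab_pruned)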
# define tf-idf model
tfidf <- TfIdf$new()
# fit the model to the train data and transform it with the fitted model
dtm_train_tfidf <- fit_transform(dtm_train, tfidf)
# apply the already-fitted model to the test data (do not refit on test)
dtm_test_tfidf <- transform(dtm_test, tfidf)
# train the model
library(glmnet)
t1 <- Sys.time()
glmnet_classifier <- cv.glmnet(x = dtm_train_tfidf, y = tweets_train[['sentiment']],
                               family = 'binomial',
                               # L1 penalty
                               alpha = 1,
                               # interested in the area under ROC curve
                               type.measure = "auc",
                               # 5-fold cross-validation
                               nfolds = 5,
                               # high value is less accurate, but has faster training
                               thresh = 1e-3,
                               # again lower number of iterations for faster training
                               maxit = 1e3)
# alternatively, reload a previously trained model instead of retraining:
# glmnet_classifier <- readRDS('glmnet_classifier.RDS')
print(difftime(Sys.time(), t1, units = 'mins'))
> print(difftime(Sys.time(), t1, units = 'mins'))
Time difference of 245.54826407 mins
> print(paste("max AUC =", round(max(glmnet_classifier$cvm), 4)))
[1] "max AUC = 0.8746"
> preds <- predict(glmnet_classifier, dtm_test_tfidf, type = 'response')[ ,1]
> head(tweets_test$text)
[1] "@Kwesidei not the whole crew "
[2] "@smarrison i would've been the first, but i didn't have a gun. not really though, zac snyder's just a doucheclown."
[3] "this week is not going as i had hoped "
[4] "thought sleeping in was an option tomorrow but realizing that it now is not. evaluations in the morning and work in the afternoon! "
[5] "@fleurylis I don't either. Its depressing. I don't think I even want to know about the kids in suitcases. "
[6] "He's the reason for the teardrops on my guitar the only one who has enough of me to break my heart "
> head(preds)
1467811372 1467812784 1467814180 1467815988 1467817502 1467818481
# save the model and vectorizer for prediction
saveRDS(glmnet_classifier, 'glmnet_classifier.RDS')
saveRDS(vectorizer,'Vectorizer.RDS')
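Later, the saved artifacts can be reloaded to score fresh tweets. A minimal sketch, assuming the fitted `tfidf` model is still in memory (it is worth saving it with `saveRDS` too; the sample texts here are made up):
vectorizer <- readRDS('Vectorizer.RDS')
glmnet_classifier <- readRDS('glmnet_classifier.RDS')
new_text <- c("what a wonderful day", "this is awful")
it_new <- itoken(new_text, preprocessor = tolower, tokenizer = word_tokenizer)
dtm_new_tfidf <- transform(create_dtm(it_new, vectorizer), tfidf)
predict(glmnet_classifier, dtm_new_tfidf, type = 'response')[, 1]  # probability of the positive class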
#######################################################