install.packages(c("tidyverse","tidytext","pdftools","tesseract","tidyr","stringr","ggplot2"))
install.packages("tidyr")
install.packages("devtools")
install.packages("textdata")
library(tidyverse)
library(pdftools)
library(tesseract)
library(tidyr)
library(devtools)
library(textdata)
library(tidytext)
library(dplyr)
library(stringr)
library(ggplot2)
install.packages("ggpubr")
library(ggpubr)
# Read one book, split it into sentences, number the sentences, then break
# each sentence into words and drop common stop words.
data_dir <- "C:\\Users\\juand\\Documents\\RLAC ASS 2\\"
txt <- read_file(paste0(data_dir, "americanwoman.txt"))

# One row per sentence.
txt_sentences <- tibble(text = txt) %>%
  unnest_tokens(sentence, text, token = "sentences")

# Number of sentences; seq_len() stays empty (instead of c(1, 0)) if the file
# produced no sentences.
d <- nrow(txt_sentences)

# `linenumber` is the sentence's position in the book; it is kept on every
# word row after the second unnest_tokens() call.
tidy_text <- txt_sentences %>%
  add_column(linenumber = seq_len(d), .before = "sentence") %>%
  unnest_tokens(word, sentence) %>%
  anti_join(stop_words, by = "word")
Sentiment analysis bit
# Keep only the words that appear in the Bing sentiment lexicon, attaching the
# lexicon's `sentiment` label to each remaining word row.
# `a` is not used in this chunk; the per-book loop further down sets the same
# value and integer-divides sentence numbers by it.
a <- 50
text_sentiment <- tidy_text %>%
  inner_join(get_sentiments("bing"), by = "word")
Error in tidy_text %>% inner_join(get_sentiments("bing")) :
could not find function "%>%"
Now that we know that this works, we do it for the five books
# Run the sentence-level sentiment pipeline on each of the five books and
# store one ggplot per book in `plots` (same order as `books`).
books <- c(
  "americanwoman.txt",
  "carryon.txt",
  "lettersfromfrance.txt",
  "lettersfromschoolboy.txt",
  "livingdeadman.txt"
)

# Loop-invariant values, hoisted out of the loop.
data_dir <- "C:\\Users\\juand\\Documents\\RLAC ASS 2\\"
a <- 50                          # sentence numbers are integer-divided by this
bing <- get_sentiments("bing")   # word -> sentiment lookup table

# Preallocate instead of growing the list on every iteration.
plots <- vector("list", length(books))

for (i in seq_along(books)) {
  book <- books[[i]]

  # One row per sentence of the book.
  txt <- read_file(paste0(data_dir, book))
  txt_sentences <- tibble(text = txt) %>%
    unnest_tokens(sentence, text, token = "sentences")
  d <- nrow(txt_sentences)

  # Number the sentences, split them into words, drop stop words.
  tidy_text <- txt_sentences %>%
    add_column(linenumber = seq_len(d), .before = "sentence") %>%
    unnest_tokens(word, sentence) %>%
    anti_join(stop_words, by = "word")

  # Count positive and negative words per bin (index = linenumber %/% a).
  # values_fill puts 0 where a bin has no words of one polarity; without it
  # that cell is NA, `total` becomes NA and the bin is missing from the plot.
  text_sentiment <- tidy_text %>%
    inner_join(bing, by = "word") %>%
    mutate(index = linenumber %/% a) %>%
    count(index, sentiment) %>%
    pivot_wider(
      names_from = sentiment,
      values_from = n,
      values_fill = list(n = 0L)
    ) %>%
    mutate(total = positive - negative)

  # Net sentiment per bin with a smoothed trend; the x-axis label is the
  # book's file name so the plots can be told apart once arranged.
  plots[[i]] <- ggplot(text_sentiment, aes(index, total, colour = total)) +
    geom_point(position = position_jitter(0.3)) +
    geom_smooth() +
    xlab(book)
}
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
Joining, by = "word"
# Show every book's plot in one two-column figure. The row count is given
# explicitly because the recorded output of the call with only `ncol = 2` was
# a list of three separate pages rather than a single grid.
ggarrange(plotlist = plots, ncol = 2, nrow = ceiling(length(plots) / 2))
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
$`1`
$`2`
$`3`
attr(,"class")
[1] "list" "ggarrange"



OK, all the letters seem to go down at the 25 to 50% mark of the book. We can take a sample of that section and one of the rest of each book and see what the differences are.
LS0tDQp0aXRsZTogIldhciBMZXR0ZXJzIg0Kb3V0cHV0OiBodG1sX25vdGVib29rDQplZGl0b3Jfb3B0aW9uczogDQogIGNodW5rX291dHB1dF90eXBlOiBpbmxpbmUNCi0tLQ0KDQpgYGB7cn0NCmluc3RhbGwucGFja2FnZXMoYygidGlkeXZlcnNlIiwidGlkeXRleHQiLCJwZGZ0b29scyIsInRlc3NlcmFjdCIsInRpZHlyIiwic3RyaW5nciIsImdncGxvdDIiKSkNCmluc3RhbGwucGFja2FnZXMoInRpZHlyIikNCmluc3RhbGwucGFja2FnZXMoImRldnRvb2xzIikNCmluc3RhbGwucGFja2FnZXMoInRleHRkYXRhIikNCmBgYA0KDQpgYGB7cn0NCmxpYnJhcnkodGlkeXZlcnNlKQ0KbGlicmFyeShwZGZ0b29scykNCmxpYnJhcnkodGVzc2VyYWN0KQ0KbGlicmFyeSh0aWR5cikNCmxpYnJhcnkoZGV2dG9vbHMpDQpsaWJyYXJ5KHRleHRkYXRhKQ0KbGlicmFyeSh0aWR5dGV4dCkNCmxpYnJhcnkoZHBseXIpDQpsaWJyYXJ5KHN0cmluZ3IpDQpsaWJyYXJ5KGdncGxvdDIpDQpgYGANCg0KDQpgYGB7cn0NCmluc3RhbGwucGFja2FnZXMoImdncHViciIpDQpsaWJyYXJ5KGdncHVicikNCmBgYA0KDQoNCmBgYHtyfQ0KdHh0ID0gMA0KdHh0IDwtIHJlYWRfZmlsZShwYXN0ZSgiQzpcXFVzZXJzXFxqdWFuZFxcRG9jdW1lbnRzXFxSTEFDIEFTUyAyXFwiLCAiYW1lcmljYW53b21hbi50eHQiLCBzZXAgPSIiKSkNCmBgYA0KDQpgYGB7cn0NCnR4dF9zZW50ZW5jZXMgPC0gdGliYmxlKHRleHQgPSB0eHQpICU+JSANCiAgdW5uZXN0X3Rva2VucyhzZW50ZW5jZSwgdGV4dCwgdG9rZW4gPSAic2VudGVuY2VzIikgDQoNCmQgPSBucm93KHR4dF9zZW50ZW5jZXMpDQoNCnRpZHlfdGV4dCA8LSB0eHRfc2VudGVuY2VzICU+JSANCiAgYWRkX2NvbHVtbihsaW5lbnVtYmVyID0gMTpkLCAuYmVmb3JlID0gInNlbnRlbmNlIikNCg0KdGlkeV90ZXh0IDwtIHRpZHlfdGV4dCAlPiUNCiAgdW5uZXN0X3Rva2Vucyh3b3JkLCBzZW50ZW5jZSkgJT4lDQogIGFudGlfam9pbihzdG9wX3dvcmRzKQ0KDQpgYGANCg0KDQpTZW50aW1lbnQgYW5hbHlzaXMgYml0DQpgYGB7cn0NCg0KdGV4dF9zZW50aW1lbnQgPSAwDQphID0gNTANCg0KdGV4dF9zZW50aW1lbnQgPC0gdGlkeV90ZXh0ICU+JQ0KICBpbm5lcl9qb2luKGdldF9zZW50aW1lbnRzKCJiaW5nIikpDQoNCnRleHRfc2VudGltZW50JGxpbmVudW1iZXIgPC0gdGV4dF9zZW50aW1lbnQkbGluZW51bWJlciUvJWENCg0KdGV4dF9zZW50aW1lbnQgPC0gdGV4dF9zZW50aW1lbnQgJT4lIGNvdW50KGluZGV4PSBsaW5lbnVtYmVyLHNlbnRpbWVudCkgJT4lIHNwcmVhZChrZXkgPSBzZW50aW1lbnQsIHZhbHVlID0gbikNCg0KdGV4dF9zZW50aW1lbnQkdG90YWwgPSBOQQ0KDQp0ZXh0X3NlbnRpbWVudCR0b3RhbCA8LSB0ZXh0X3NlbnRpbWVudCRwb3NpdGl2ZSAtIHRleHRfc2VudGltZW50JG5lZ2F0aXZlDQoNCmdncGxvdCh0ZXh0X3NlbnRpbWVudCwgYWVzKGluZGV4LCB0b3RhbCwgY29sb3VyID0gdG90YWwpKSArDQogICBnZW9t
X3BvaW50KHBvc2l0aW9uPXBvc2l0aW9uX2ppdHRlcigwLjMpKSArIGdlb21fc21vb3RoKCkNCmBgYA0KDQpOb3cgdGhhdCB3ZSBrbm93IHRoYXQgdGhpcyB3b3Jrcywgd2UgZG8gaXQgZm9yIHRoZSBmaXZlIGJvb2tzDQpgYGB7cn0NCiAgYm9va3MgPC0gYygnYW1lcmljYW53b21hbi50eHQnLCdjYXJyeW9uLnR4dCcsJ2xldHRlcnNmcm9tZnJhbmNlLnR4dCcsJ2xldHRlcnNmcm9tc2Nob29sYm95LnR4dCcsJ2xpdmluZ2RlYWRtYW4udHh0JykNCg0KcGxvdHMgPC0gbGlzdCgpDQoNCmZvcihib29rIGluIGJvb2tzKQ0Kew0KICB0eHQgPC0gcmVhZF9maWxlKHBhc3RlKCJDOlxcVXNlcnNcXGp1YW5kXFxEb2N1bWVudHNcXFJMQUMgQVNTIDJcXCIsIGJvb2ssIHNlcCA9IiIpKQ0KICB0eHRfc2VudGVuY2VzIDwtIHRpYmJsZSh0ZXh0ID0gdHh0KSAlPiUgDQogIHVubmVzdF90b2tlbnMoc2VudGVuY2UsIHRleHQsIHRva2VuID0gInNlbnRlbmNlcyIpIA0KDQpkID0gbnJvdyh0eHRfc2VudGVuY2VzKQ0KDQp0aWR5X3RleHQgPC0gdHh0X3NlbnRlbmNlcyAlPiUgDQogIGFkZF9jb2x1bW4obGluZW51bWJlciA9IDE6ZCwgLmJlZm9yZSA9ICJzZW50ZW5jZSIpDQoNCnRpZHlfdGV4dCA8LSB0aWR5X3RleHQgJT4lDQogIHVubmVzdF90b2tlbnMod29yZCwgc2VudGVuY2UpICU+JQ0KICBhbnRpX2pvaW4oc3RvcF93b3JkcykNCnRleHRfc2VudGltZW50ID0gMA0KYSA9IDUwDQoNCnRleHRfc2VudGltZW50IDwtIHRpZHlfdGV4dCAlPiUNCiAgaW5uZXJfam9pbihnZXRfc2VudGltZW50cygiYmluZyIpKQ0KDQp0ZXh0X3NlbnRpbWVudCRsaW5lbnVtYmVyIDwtIHRleHRfc2VudGltZW50JGxpbmVudW1iZXIlLyVhDQoNCnRleHRfc2VudGltZW50IDwtIHRleHRfc2VudGltZW50ICU+JSBjb3VudChpbmRleD0gbGluZW51bWJlcixzZW50aW1lbnQpICU+JSBzcHJlYWQoa2V5ID0gc2VudGltZW50LCB2YWx1ZSA9IG4pDQoNCnRleHRfc2VudGltZW50JHRvdGFsID0gTkENCg0KdGV4dF9zZW50aW1lbnQkdG90YWwgPC0gdGV4dF9zZW50aW1lbnQkcG9zaXRpdmUgLSB0ZXh0X3NlbnRpbWVudCRuZWdhdGl2ZQ0KDQpQIDwtIGdncGxvdCh0ZXh0X3NlbnRpbWVudCwgYWVzKGluZGV4LCB0b3RhbCwgY29sb3VyID0gdG90YWwpKSArDQogICBnZW9tX3BvaW50KHBvc2l0aW9uPXBvc2l0aW9uX2ppdHRlcigwLjMpKSArIGdlb21fc21vb3RoKCkgKyB4bGFiKGJvb2spDQoNCnBsb3RzW1tsZW5ndGgocGxvdHMpICsgMV1dIDwtIFANCiAgDQp9DQoNCmdnYXJyYW5nZShwbG90bGlzdCA9IHBsb3RzLCBuY29sID0gMikNCg0KDQoNCmBgYA0KT0sgQUxMIFRIRSBMRVRURVJTIFNFRU0gVE8gR08gRE9XTiBBVCBUSEUgMjUgdG8gNTAlIG1hcmsgb2YgdGhlIGJvb2suIFdlIGNhbiB0YWtlIGEgc2FtcGxlIG9mIHRoYXQgYW5kIG9uZSBvZiB0aGUgcmVzdCBvZiB0aGUgYm9va3MgYW5kIHNlZSB3aGF0IHRoZSBkaWZmZXJlbmNlcyBhcmVlZWVlDQo=