main.R 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286
  1. library(readr)
  2. library(dplyr)
  3. library(stringr)
  4. library(parallel)
  5. SENSITIVITY <- 4
  6. #MAX <- nrow(DATA1)
  7. MAX <- 500
  8. LEARNED.POSITIVE <- paste(getwd(), "learned_positive.csv", sep = "/")
  9. LEARNED.NEGATIVE <- paste(getwd(), "learned_negative.csv", sep = "/")
  10. sentiment.train <- function(str, sen){
  11. env <- .GlobalEnv
  12. spl <- sentiment.split(toupper(str))
  13. if((length(spl)-SENSITIVITY+1) < 1) {
  14. cat("You have to train me with more text.\n")
  15. return()
  16. }
  17. for(i in 1:(length(spl)-SENSITIVITY+1)){
  18. grp <- paste(spl[i:(i+SENSITIVITY-1)], collapse = ' ')
  19. if(sen == 1){
  20. mtc <- match(grp, env$positive.cmb)
  21. if(!is.na(mtc)){
  22. env$positive.cnt[mtc] <- env$positive.cnt[mtc] + 1
  23. }
  24. else{
  25. env$positive.cmb <- c(env$positive.cmb, grp)
  26. env$positive.cnt <- c(env$positive.cnt, 1)
  27. }
  28. }
  29. else{
  30. mtc <- match(grp, env$negative.cmb)
  31. if(!is.na(mtc)){
  32. env$negative.cnt[mtc] <- env$negative.cnt[mtc] + 1
  33. }
  34. else{
  35. env$negative.cmb <- c(env$negative.cmb, grp)
  36. env$negative.cnt <- c(env$negative.cnt, 1)
  37. }
  38. }
  39. }
  40. }
  41. sentiment.test <- function(){
  42. env <- .GlobalEnv
  43. cat("Ohh so you want to test me?\n")
  44. cat("Well come on then. Let's do this!\n\n")
  45. cat("First of all. Can you give me the test(set)?\n")
  46. set <- file.choose()
  47. set <- env$set.import(set)
  48. MAX <- nrow(set)
  49. MIN <- 1
  50. if(!console.confirm(paste("Do you want me to test all of the", as.character(nrow(set)), "records?"))){
  51. cat("Well thanks that might just saved me a huge headache.\n")
  52. repeat{
  53. MIN <- console.ask("So where do you want me to start?", type = "integer")
  54. if(MIN > 0 && MIN <= MAX) break
  55. else cat("Please enter a number bigger than 0 and smaller or equal than the records in this set.\n")
  56. }
  57. repeat{
  58. MAX <- console.ask("And where do you want me to stop?", type = "integer")
  59. if(MAX >= MIN && MAX <= nrow(set)) break
  60. else cat(paste("Please enter a number bigger or equal then", as.character(MIN), "and smaller or equal then", as.character(nrow(set))))
  61. }
  62. }
  63. score <- c()
  64. time.start <- Sys.time()
  65. cat("*Intensive thinking* Hmmmm...\n")
  66. progress <- txtProgressBar(min = (MIN - 1), max = MAX, style = 3)
  67. setTxtProgressBar(progress, (MIN - 1))
  68. for(i in MIN:MAX){
  69. test <- sentiment.calc(set[i,]$review, progress = FALSE)
  70. score <- c(score, (test==as.integer(set[i,]$sentiment)))
  71. setTxtProgressBar(progress, i)
  72. }
  73. time.end <- Sys.time()
  74. close(progress)
  75. cat("Phoee... Finally done. Hope I did well...\n")
  76. cat("It took me", format(time.end - time.start, format = "%H:%M:%S"), "\n")
  77. score <- as.integer(mean(score)*100)
  78. if(score > 80){
  79. cat(paste0("OMG! I got ", as.character(score), "% correct!\n"))
  80. }
  81. else{
  82. cat(paste0("Hmm. I'm not quite happy with a score of ", as.character(score), "%\n"))
  83. }
  84. return(score)
  85. }
  86. sentiment.calc <- function(str, progress=TRUE) {
  87. env <- .GlobalEnv
  88. spl <- sentiment.split(toupper(str))
  89. pos <- 0
  90. neg <- 0
  91. data <- data.frame(grp=character())
  92. if((length(spl)-SENSITIVITY+1) < 1){
  93. stop("I really need some more text to figure this one out.\n")
  94. }
  95. if(progress){
  96. time.start <- Sys.time()
  97. cat("Let me think...\n")
  98. prog <- txtProgressBar(0, (length(spl)-SENSITIVITY+1), style = 3)
  99. }
  100. for(i in 1:(length(spl)-SENSITIVITY+1)){
  101. grp <- paste(spl[i:(i+SENSITIVITY-1)], collapse = ' ')
  102. mtc <- match(grp, env$positive.cmb)
  103. if(!is.na(mtc)){
  104. pos <- pos + env$positive.cnt[mtc]
  105. }
  106. mtc <- match(grp, env$negative.cmb)
  107. if(!is.na(mtc)){
  108. neg <- neg + env$negative.cnt[mtc]
  109. }
  110. # if(nrow(env$positive[env$positive==grp,]) == 1){
  111. # pos <- pos + env$positive[env$positive==grp,]$cnt
  112. # }
  113. # if(nrow(env$negative[env$negative==grp,]) == 1){
  114. # neg <- neg + env$negative[env$negative==grp,]$cnt
  115. # }
  116. if(progress) setTxtProgressBar(prog, i)
  117. }
  118. if(progress){
  119. close(prog)
  120. time.end <- Sys.time()
  121. cat("It took me", format(time.end - time.start, format = "%H:%M:%S"), "\n")
  122. if(pos >= neg){
  123. cat("This must be a POSITIVE review\n")
  124. }
  125. else{
  126. cat("This must be a NEGATIVE review\n")
  127. }
  128. }
  129. print(pos)
  130. print(neg)
  131. return(pos >= neg)
  132. }
  133. sentiment.split <- function(str){
  134. return(na.omit(unlist(strsplit(unlist(str), "[^a-zA-Z]+"))))
  135. }
  136. learn.save <- function(){
  137. env <- .GlobalEnv
  138. if(!file.exists(LEARNED.POSITIVE) || console.confirm("Positive learned file already exists. Overwrite?")){
  139. pos <- data.frame(cmb=env$positive.cmb, cnt=env$positive.cnt)
  140. write_csv(pos, LEARNED.POSITIVE)
  141. }
  142. if(!file.exists(LEARNED.NEGATIVE) || console.confirm("Negative learned file already exists. Overwrite?")){
  143. neg <- data.frame(cmb=env$negative.cmb, cnt=env$negative.cnt)
  144. write_csv(neg, LEARNED.NEGATIVE)
  145. }
  146. }
  147. learn.load <- function(){
  148. env <- .GlobalEnv
  149. if(file.exists(LEARNED.POSITIVE) && file.exists(LEARNED.NEGATIVE)){
  150. if(console.confirm("I found out that I already learned something a while ago. Do you want to use that data?")){
  151. pos <- read_csv(LEARNED.POSITIVE)
  152. neg <- read_csv(LEARNED.NEGATIVE)
  153. env$positive.cmb <- pos$cmb
  154. env$positive.cnt <- pos$cnt
  155. env$negative.cmb <- neg$cmb
  156. env$negative.cnt <- neg$cnt
  157. return(TRUE)
  158. }
  159. }
  160. return(FALSE)
  161. }
  162. console.confirm <- function(str){
  163. repeat{
  164. ans <- readline(prompt = paste(str, "[Y|N]: "))
  165. if(ans == "Y") return(TRUE)
  166. if(ans == "N") return(FALSE)
  167. cat("Enter Y or N. Let's try it again.\n")
  168. }
  169. }
  170. console.ask <- function(str, type="string"){
  171. repeat{
  172. ans <- readline(prompt = paste0(str, " [", type, "]: "))
  173. if(type == "string"){
  174. return(ans)
  175. }
  176. if(type == "integer"){
  177. if(grepl("^[0-9]+$", ans)){
  178. return(as.integer(ans))
  179. }
  180. }
  181. cat(paste(type, "only please!", "\n"))
  182. }
  183. }
  184. set.import <- function(fullPath){
  185. suppressMessages(suppressWarnings(
  186. if(endsWith(fullPath, ".tsv")) return(read_delim(fullPath, "\t", escape_backslash = TRUE, escape_double = FALSE, trim_ws = TRUE))
  187. else if(endsWith(fullPath, ".csv")) return(read_csv(fullPath, trim_ws = TRUE))
  188. ))
  189. }
  190. learn.teach <- function(){
  191. if(console.confirm("Do you want to train me so I can be better?")){
  192. if(exists("positive.cmb") && exists("positive.cnt") && exists("negative.cmb") && exists("negative.cnt")){
  193. cat("Hmmm... I already know someting.\n")
  194. if(!console.confirm("Do you want me to continue to learn? (Append learning skillset)")){
  195. positive.cmb <- c()
  196. positive.cnt <- c()
  197. negative.cmb <- c()
  198. negative.cnt <- c()
  199. }
  200. }
  201. else{
  202. positive.cmb <- c()
  203. positive.cnt <- c()
  204. negative.cmb <- c()
  205. negative.cnt <- c()
  206. }
  207. set <- file.choose()
  208. set <- set.import(set)
  209. MAX <- nrow(set)
  210. MIN <- 1
  211. if(!console.confirm(paste("Do you want me to learn all of the", as.character(nrow(set)), "records?"))){
  212. cat("Well thanks that might just saved me a huge headache.\n")
  213. repeat{
  214. MIN <- console.ask("So where do you want me to start?", type = "integer")
  215. if(MIN > 0 && MIN <= MAX) break
  216. else cat("Please enter a number bigger than 0 and smaller or equal than ", as.character(MAX), "\n")
  217. }
  218. repeat{
  219. MAX <- console.ask("And where do you want me to stop?", type = "integer")
  220. if(MAX >= MIN && MAX <= nrow(set)) break
  221. else cat(paste("Please enter a number bigger or equal then", as.character(MIN), "and smaller or equal then", as.character(nrow(set))))
  222. }
  223. }
  224. cat("Getting smarter...\n")
  225. progress <- txtProgressBar((MIN-1), MAX, style = 3)
  226. setTxtProgressBar(progress, (MIN-1))
  227. for(i in MIN:MAX){
  228. sentiment.train(set[i,]$review, as.integer(set[i,]$sentiment))
  229. setTxtProgressBar(progress, i)
  230. }
  231. close(progress)
  232. if(console.confirm("Let me catch some breath here. Do you want me to remeber this training?")) learn.save()
  233. }
  234. cat("Now that I know everything. There is one thing you should learn.\n")
  235. cat("If you want me to analyse a review just call:\n\n")
  236. cat("sentiment.calc(<any text>)\n\n")
  237. cat("Now let's get started!\n")
  238. }