main.R 8.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297
  1. # ##########
  2. # Written by: Deben Oldert
  3. #
  4. # Keep in mind that it took me several days/weeks and beers to make this.
  5. # So please give me some credit. Naiba and I won't bite.
  6. #
  7. # This program is called Naiba.
  8. # She can learn, and tell you, whether movie reviews are positive or negative
  9. #
  10. # #########
  11. library(readr)
  12. library(dplyr)
  13. library(stringr)
  14. library(parallel)
  15. source("threaded.R")
  # Number of consecutive words that are joined into one learned word group.
  SENSITIVITY <- 4

  # CSV files, inside the current working directory, that hold what was learned.
  LEARNED.POSITIVE <- file.path(getwd(), "learned_positive_5000S.csv")
  LEARNED.NEGATIVE <- file.path(getwd(), "learned_negative_5000S.csv")

  # Greeting banner.
  # NOTE(review): PROCESSES is not defined in this file; presumably it comes
  # from threaded.R -- confirm.
  cat("Hey there! My name is Naiba. Nice to meet you.\n")
  cat(
    "Thanks to the magic of multi-threading I have",
    PROCESSES,
    "brains (CPU). But this only works under a UNIX environment (e.g. MacOS).\n"
  )
  cat(
    "Don't you even dare to call the *.threaded functions in a windwows environment.\n",
    "If you have any question about me, just go to:\n",
    "https://github.com/DebenOldert/Big-Data-Review-Analysis/blob/master/README.md\n",
    "But that's enough trashtalk. Let's do this!",
    sep = ""
  )
  #' Learn the word groups of one labelled text
  #'
  #' Upper-cases `str`, splits it into words with `sentiment.split()` and
  #' counts every run of `SENSITIVITY` consecutive words. The counts are added
  #' to the global `positive.cmb` / `positive.cnt` vectors when `sen == 1` and
  #' to `negative.cmb` / `negative.cnt` otherwise. Unknown word groups are
  #' appended in order of first appearance.
  #'
  #' @param str Text to learn from.
  #' @param sen Label of the text: `1` means positive, any other value negative.
  #' @return `NULL`, invisibly. Called for its side effect on `.GlobalEnv`.
  sentiment.train <- function(str, sen) {
    env <- .GlobalEnv
    spl <- as.character(sentiment.split(toupper(str)))
    n.grp <- length(spl) - SENSITIVITY + 1
    if (n.grp < 1) {
      cat("You have to train me with more text.\n")
      return(invisible(NULL))
    }
    if (length(sen) != 1L || is.na(sen)) {
      stop(
        "sen must be a single non-missing value: 1 for positive, ",
        "anything else for negative.",
        call. = FALSE
      )
    }

    # Build all sliding windows at once: column k holds the k-th word of every
    # window, so an element-wise paste yields one string per window.
    cols <- lapply(seq_len(SENSITIVITY), function(k) spl[k:(k + n.grp - 1)])
    grp <- do.call(paste, cols)

    # Count each distinct group once instead of growing the store per window.
    # unique() keeps first-appearance order, so new groups are appended in the
    # same order a window-by-window loop would have produced.
    uniq <- unique(grp)
    occ <- as.numeric(tabulate(match(grp, uniq), nbins = length(uniq)))

    label <- if (sen == 1) "positive" else "negative"
    cmb.name <- paste0(label, ".cmb")
    cnt.name <- paste0(label, ".cnt")
    cmb <- env[[cmb.name]]
    cnt <- env[[cnt.name]]

    at <- match(uniq, cmb)
    known <- !is.na(at)
    if (any(known)) {
      cnt[at[known]] <- cnt[at[known]] + occ[known]
    }
    if (!all(known)) {
      cmb <- c(cmb, uniq[!known])
      cnt <- c(cnt, occ[!known])
    }

    env[[cmb.name]] <- cmb
    env[[cnt.name]] <- cnt
    invisible(NULL)
  }
  #' Interactively measure how well the learned data classifies a labelled set
  #'
  #' Asks the user for a data file and, optionally, a first and last record,
  #' then runs `sentiment.calc()` on the `review` column of every selected
  #' record and compares the outcome with its `sentiment` column.
  #'
  #' Records that `sentiment.calc()` refuses (for example because the text is
  #' too short) or that have no usable label are skipped and reported, rather
  #' than aborting the whole run.
  #'
  #' @return The percentage of scored records that were classified correctly,
  #'   truncated to an integer, or `NA_integer_` when no record could be scored.
  sentiment.test <- function() {
    env <- .GlobalEnv
    cat("Ohh so you want to test me?\n")
    cat("Well come on then. Let's do this!\n\n")
    cat("First of all. Can you give me the test(set)?\n")
    set <- env$set.import(file.choose())
    if (is.null(set) || !all(c("review", "sentiment") %in% names(set)) ||
        nrow(set) < 1) {
      stop(
        "I need a .tsv or .csv file with 'review' and 'sentiment' columns ",
        "and at least one record.",
        call. = FALSE
      )
    }

    total <- nrow(set)
    MAX <- total
    MIN <- 1
    if (!console.confirm(paste("Do you want me to test all of the", as.character(total), "records?"))) {
      cat("Well thanks that might just saved me a huge headache.\n")
      repeat {
        MIN <- console.ask("So where do you want me to start?", type = "integer")
        if (!is.na(MIN) && MIN > 0 && MIN <= MAX) break
        cat("Please enter a number bigger than 0 and smaller or equal than the records in this set.\n")
      }
      repeat {
        MAX <- console.ask("And where do you want me to stop?", type = "integer")
        if (!is.na(MAX) && MAX >= MIN && MAX <= total) break
        # Ends with a newline so the next prompt starts on its own line.
        cat("Please enter a number bigger or equal then", as.character(MIN),
            "and smaller or equal then", as.character(total), "\n")
      }
    }

    # Pull the columns out once instead of slicing a one-row table per record.
    reviews <- set$review
    sentiments <- as.integer(set$sentiment)
    idx <- MIN:MAX
    score <- rep(NA, length(idx))

    time.start <- Sys.time()
    cat("*Intensive thinking* Hmmmm...\n")
    progress <- txtProgressBar(min = (MIN - 1), max = MAX, style = 3)
    tryCatch({
      setTxtProgressBar(progress, (MIN - 1))
      for (k in seq_along(idx)) {
        i <- idx[k]
        # A record sentiment.calc() cannot judge stays NA and is skipped.
        test <- tryCatch(
          sentiment.calc(reviews[i], progress = FALSE),
          error = function(e) NA
        )
        score[k] <- test == sentiments[i]
        setTxtProgressBar(progress, i)
      }
    }, finally = close(progress))
    time.end <- Sys.time()

    cat("Phoee... Finally done. Hope I did well...\n")
    cat("It took me", format(time.end - time.start, format = "%H:%M:%S"), "\n")

    skipped <- sum(is.na(score))
    if (skipped > 0) {
      cat("I had to skip", skipped, "records that I could not judge.\n")
    }
    if (skipped == length(score)) {
      cat("So there is nothing left to give me a score on.\n")
      return(NA_integer_)
    }

    score <- as.integer(mean(score, na.rm = TRUE) * 100)
    if (score > 80) {
      cat(paste0("OMG! I got ", as.character(score), "% correct!\n"))
    } else {
      cat(paste0("Hmm. I'm not quite happy with a score of ", as.character(score), "%\n"))
    }
    return(score)
  }
  #' Decide whether a text reads as a positive or a negative review
  #'
  #' Upper-cases `str`, splits it with `sentiment.split()` and walks over every
  #' run of `SENSITIVITY` consecutive words. Each run found in the global
  #' `positive.cmb` / `negative.cmb` vectors adds its learned count to the
  #' positive or negative total.
  #'
  #' @param str Text to classify.
  #' @param progress When `TRUE`, show a progress bar, the elapsed time and the
  #'   verdict on the console.
  #' @return `TRUE` when the positive total is at least the negative total,
  #'   `FALSE` otherwise. Stops with an error when the text has fewer than
  #'   `SENSITIVITY` words.
  sentiment.calc <- function(str, progress=TRUE) {
    store <- .GlobalEnv
    words <- sentiment.split(toupper(str))
    windows <- length(words) - SENSITIVITY + 1
    if (windows < 1) {
      stop("I really need some more text to figure this one out.\n")
    }

    # Learned count of one word group, or 0 when the group was never learned.
    weight <- function(group, groups, counts) {
      at <- match(group, groups)
      if (is.na(at)) 0 else counts[at]
    }

    if (progress) {
      started <- Sys.time()
      cat("Let me think...\n")
      bar <- txtProgressBar(0, windows, style = 3)
    }

    pos <- 0
    neg <- 0
    for (first in seq_len(windows)) {
      group <- paste(words[first:(first + SENSITIVITY - 1)], collapse = ' ')
      pos <- pos + weight(group, store$positive.cmb, store$positive.cnt)
      neg <- neg + weight(group, store$negative.cmb, store$negative.cnt)
      if (progress) setTxtProgressBar(bar, first)
    }

    verdict <- pos >= neg
    if (!progress) {
      return(verdict)
    }

    close(bar)
    finished <- Sys.time()
    cat("It took me", format(finished - started, format = "%H:%M:%S"), "\n")
    cat(
      if (verdict) {
        "This must be a POSITIVE review\n"
      } else {
        "This must be a NEGATIVE review\n"
      }
    )
    return(verdict)
  }
  #' Split text into words
  #'
  #' Every run of characters other than the ASCII letters a-z / A-Z separates
  #' two words. Missing values and empty tokens are dropped. `strsplit()`
  #' produces an empty first token whenever a text starts with a separator
  #' (e.g. a quote or a digit), and keeping it would create word groups that
  #' start with a blank.
  #'
  #' @param str Character vector (or list of character vectors) to split.
  #' @return A plain character vector with the words of all inputs, in order;
  #'   `character(0)` when there are none.
  sentiment.split <- function(str) {
    tokens <- unlist(strsplit(unlist(str), "[^a-zA-Z]+"))
    if (is.null(tokens)) {
      return(character(0))
    }
    tokens[!is.na(tokens) & nzchar(tokens)]
  }
  #' Write the learned word groups to disk
  #'
  #' Saves the global `positive.cmb` / `positive.cnt` vectors to
  #' `LEARNED.POSITIVE` and the `negative.*` vectors to `LEARNED.NEGATIVE` as
  #' CSV files with the columns `cmb` and `cnt`. A file that already exists is
  #' only replaced after the user confirms.
  learn.save <- function() {
    store <- .GlobalEnv

    # A new file may always be written; an existing one needs the user's OK.
    allowed <- function(path, question) {
      if (file.exists(path)) console.confirm(question) else TRUE
    }

    if (allowed(LEARNED.POSITIVE, "Positive learned file already exists. Overwrite?")) {
      learned <- data.frame(cmb = store$positive.cmb, cnt = store$positive.cnt)
      write_csv(learned, LEARNED.POSITIVE)
    }
    if (allowed(LEARNED.NEGATIVE, "Negative learned file already exists. Overwrite?")) {
      learned <- data.frame(cmb = store$negative.cmb, cnt = store$negative.cnt)
      write_csv(learned, LEARNED.NEGATIVE)
    }
  }
  #' Offer to restore previously saved learning data
  #'
  #' When both `LEARNED.POSITIVE` and `LEARNED.NEGATIVE` exist and the user
  #' agrees, reads them and copies their `cmb` and `cnt` columns into the
  #' global `positive.*` and `negative.*` vectors.
  #'
  #' @return `TRUE` when the saved data was loaded, `FALSE` otherwise.
  learn.load <- function() {
    store <- .GlobalEnv

    have.files <- file.exists(LEARNED.POSITIVE) && file.exists(LEARNED.NEGATIVE)
    if (!have.files) {
      return(FALSE)
    }
    if (!console.confirm("I found out that I already learned something a while ago. Do you want to use that data?")) {
      return(FALSE)
    }

    # Read one learned file without readr's messages and warnings.
    quiet.read <- function(path) {
      suppressMessages(suppressWarnings(read_csv(path)))
    }
    learned.pos <- quiet.read(LEARNED.POSITIVE)
    learned.neg <- quiet.read(LEARNED.NEGATIVE)

    store$positive.cmb <- learned.pos$cmb
    store$positive.cnt <- learned.pos$cnt
    store$negative.cmb <- learned.neg$cmb
    store$negative.cnt <- learned.neg$cnt
    return(TRUE)
  }
  #' Ask a yes/no question on the console
  #'
  #' Keeps asking until the answer is `Y` or `N`. Case and surrounding
  #' whitespace are ignored, so `y` and ` n ` are accepted as well.
  #'
  #' @param str The question to show in front of the `[Y|N]: ` hint.
  #' @return `TRUE` for yes, `FALSE` for no.
  console.confirm <- function(str) {
    repeat {
      ans <- toupper(trimws(readline(prompt = paste(str, "[Y|N]: "))))
      if (identical(ans, "Y")) return(TRUE)
      if (identical(ans, "N")) return(FALSE)
      cat("Enter Y or N. Let's try it again.\n")
    }
  }
  #' Ask the user for a value on the console
  #'
  #' @param str The question to show; the expected type is appended in square
  #'   brackets.
  #' @param type Either `"string"` (any answer is returned as typed) or
  #'   `"integer"` (keeps asking until the answer consists of digits only and
  #'   fits in an R integer).
  #' @return The answer: a character string for `"string"`, a non-missing
  #'   integer for `"integer"`.
  console.ask <- function(str, type="string") {
    # An unsupported type could never be satisfied and would prompt forever.
    if (!is.character(type) || length(type) != 1L ||
        !type %in% c("string", "integer")) {
      stop("type must be \"string\" or \"integer\".", call. = FALSE)
    }
    repeat {
      ans <- readline(prompt = paste0(str, " [", type, "]: "))
      if (type == "string") {
        return(ans)
      }
      digits <- trimws(ans)
      if (grepl("^[0-9]+$", digits)) {
        # A digit string too large for an integer converts to NA (with a
        # warning); ask again instead of handing NA to the caller.
        value <- suppressWarnings(as.integer(digits))
        if (!is.na(value)) {
          return(value)
        }
      }
      cat(paste(type, "only please!", "\n"))
    }
  }
  #' Read a data set from a tab- or comma-separated file
  #'
  #' The reader is picked from the file extension, ignoring case: `.tsv` files
  #' are read with `read_delim()` using a tab delimiter and backslash escapes,
  #' `.csv` files with `read_csv()`. Messages and warnings raised while reading
  #' are suppressed.
  #'
  #' @param fullPath Path of the file to read.
  #' @return Whatever the selected reader returns for the file. Stops with an
  #'   error for any other extension instead of silently returning `NULL`.
  set.import <- function(fullPath) {
    if (!is.character(fullPath) || length(fullPath) != 1L || is.na(fullPath)) {
      stop("fullPath must be a single file path.", call. = FALSE)
    }

    extension <- tolower(fullPath)
    if (endsWith(extension, ".tsv")) {
      reader <- function() {
        read_delim(fullPath, "\t", escape_backslash = TRUE,
                   escape_double = FALSE, trim_ws = TRUE)
      }
    } else if (endsWith(extension, ".csv")) {
      reader <- function() read_csv(fullPath, trim_ws = TRUE)
    } else {
      stop(
        "Unsupported file type: ", basename(fullPath),
        " (expected a .tsv or .csv file).",
        call. = FALSE
      )
    }
    suppressMessages(suppressWarnings(reader()))
  }
  #' Interactively train on a labelled data set
  #'
  #' After the user agrees to train, either keeps or clears the global
  #' `positive.*` / `negative.*` vectors, asks for a data file and, optionally,
  #' a first and last record, and feeds the `review` and `sentiment` columns of
  #' every selected record to `sentiment.train()`. Records without a usable
  #' label are skipped and reported. Finally offers to store the result with
  #' `learn.save()` and prints a short usage hint.
  learn.teach <- function() {
    env <- .GlobalEnv
    if (console.confirm("Do you want to train me so I can be better?")) {
      learned <- c("positive.cmb", "positive.cnt", "negative.cmb", "negative.cnt")
      knows <- all(vapply(learned, exists, logical(1), envir = env))
      if (knows) {
        cat("Hmmm... I already know someting.\n")
      }
      # Start from scratch when nothing was learned yet or the user does not
      # want to append to the existing data.
      if (!knows || !console.confirm("Do you want me to continue to learn? (Append learning skillset)")) {
        for (name in learned) {
          assign(name, c(), envir = env)
        }
      }

      set <- set.import(file.choose())
      if (is.null(set) || !all(c("review", "sentiment") %in% names(set)) ||
          nrow(set) < 1) {
        stop(
          "I need a .tsv or .csv file with 'review' and 'sentiment' columns ",
          "and at least one record.",
          call. = FALSE
        )
      }

      total <- nrow(set)
      MAX <- total
      MIN <- 1
      if (!console.confirm(paste("Do you want me to learn all of the", as.character(total), "records?"))) {
        cat("Well thanks that might just saved me a huge headache.\n")
        repeat {
          MIN <- console.ask("So where do you want me to start?", type = "integer")
          if (!is.na(MIN) && MIN > 0 && MIN <= MAX) break
          cat("Please enter a number bigger than 0 and smaller or equal than ", as.character(MAX), "\n")
        }
        repeat {
          MAX <- console.ask("And where do you want me to stop?", type = "integer")
          if (!is.na(MAX) && MAX >= MIN && MAX <= total) break
          # Ends with a newline so the next prompt starts on its own line.
          cat("Please enter a number bigger or equal then", as.character(MIN),
              "and smaller or equal then", as.character(total), "\n")
        }
      }

      # Pull the columns out once instead of slicing a one-row table per record.
      reviews <- set$review
      sentiments <- as.integer(set$sentiment)
      skipped <- 0

      cat("Getting smarter...\n")
      progress <- txtProgressBar((MIN - 1), MAX, style = 3)
      tryCatch({
        setTxtProgressBar(progress, (MIN - 1))
        for (i in MIN:MAX) {
          if (is.na(sentiments[i])) {
            # Without a label there is nothing to learn from this record.
            skipped <- skipped + 1
          } else {
            sentiment.train(reviews[i], sentiments[i])
          }
          setTxtProgressBar(progress, i)
        }
      }, finally = close(progress))
      if (skipped > 0) {
        cat("I had to skip", skipped, "records without a usable sentiment.\n")
      }

      if (console.confirm("Let me catch some breath here. Do you want me to remeber this training?")) learn.save()
    }
    cat("Now that I know everything. There is one thing you should learn.\n")
    cat("If you want me to analyse a review just call:\n\n")
    cat("sentiment.calc(<any text>)\n\n")
    cat("Now let's get started!\n")
  }