Просмотр исходного кода

Speed FIX up 100 times faster + Threading update

Deben Oldert 9 лет назад
Родитель
Сommit
1bb209af90
4 измененных файлов с 485 добавлено и 355 удалено
  1. 167 167
      learned_negative.csv
  2. 165 165
      learned_positive.csv
  3. 50 23
      main.R
  4. 103 0
      threaded.R

Разница между файлами не показана из-за своего большого размера
+ 167 - 167
learned_negative.csv


Разница между файлами не показана из-за своего большого размера
+ 165 - 165
learned_positive.csv


+ 50 - 23
main.R

@@ -23,19 +23,23 @@ sentiment.train <- function(str, sen){
   for(i in 1:(length(spl)-SENSITIVITY+1)){
     grp <- paste(spl[i:(i+SENSITIVITY-1)], collapse = ' ')
     if(sen == 1){
-      if(nrow(env$positive %>% filter(str_detect(cmb, grp))) == 1){
-        env$positive[env$positive$cmb==grp,]$cnt <- env$positive[env$positive$cmb==grp,]$cnt + 1
+      mtc <- match(grp, env$positive.cmb)
+      if(!is.na(mtc)){
+        env$positive.cnt[mtc] <- env$positive.cnt[mtc] + 1
       }
       else{
-        env$positive[(nrow(env$positive)+1), ] <- list(as.character(grp), 1)
+        env$positive.cmb <- c(env$positive.cmb, grp)
+        env$positive.cnt <- c(env$positive.cnt, 1)
       }
     }
     else{
-      if(nrow(env$negative %>% filter(str_detect(cmb, grp))) == 1){
-        env$negative[env$negative$cmb==grp,]$cnt <- env$negative[env$negative$cmb==grp,]$cnt + 1
+      mtc <- match(grp, env$negative.cmb)
+      if(!is.na(mtc)){
+        env$negative.cnt[mtc] <- env$negative.cnt[mtc] + 1
       }
       else{
-        env$negative[(nrow(env$negative)+1), ] <- list(as.character(grp), 1)
+        env$negative.cmb <- c(env$negative.cmb, grp)
+        env$negative.cnt <- c(env$negative.cnt, 1)
       }
     }
   }
@@ -118,12 +122,21 @@ sentiment.calc <- function(str, progress=TRUE) {
   for(i in 1:(length(spl)-SENSITIVITY+1)){
     grp <- paste(spl[i:(i+SENSITIVITY-1)], collapse = ' ')
     
-    if(nrow(env$positive[env$positive==grp,]) == 1){
-      pos <- pos + env$positive[env$positive==grp,]$cnt
+    mtc <- match(grp, env$positive.cmb)
+    if(!is.na(mtc)){
+      pos <- pos + env$positive.cnt[mtc]
     }
-    if(nrow(env$negative[env$negative==grp,]) == 1){
-      neg <- neg + env$negative[env$negative==grp,]$cnt
+    mtc <- match(grp, env$negative.cmb)
+    if(!is.na(mtc)){
+      neg <- neg + env$negative.cnt[mtc]
     }
+    
+    # if(nrow(env$positive[env$positive==grp,]) == 1){
+    #   pos <- pos + env$positive[env$positive==grp,]$cnt
+    # }
+    # if(nrow(env$negative[env$negative==grp,]) == 1){
+    #   neg <- neg + env$negative[env$negative==grp,]$cnt
+    # }
     if(progress) setTxtProgressBar(prog, i)
   }
   
@@ -139,8 +152,8 @@ sentiment.calc <- function(str, progress=TRUE) {
       cat("This must be a NEGATIVE review\n")
     }
   }
-  #print(pos)
-  #print(neg)
+  print(pos)
+  print(neg)
   return(pos >= neg)
 }
 
@@ -149,24 +162,32 @@ sentiment.split <- function(str){
 }
 
 learn.save <- function(){
-  env <- parent.frame()
+  env <- .GlobalEnv
   
   if(!file.exists(LEARNED.POSITIVE) || console.confirm("Positive learned file already exists. Overwrite?")){
-    write_csv(env$positive, LEARNED.POSITIVE)
+    pos <- data.frame(cmb=env$positive.cmb, cnt=env$positive.cnt)
+    write_csv(pos, LEARNED.POSITIVE)
   }
   if(!file.exists(LEARNED.NEGATIVE) || console.confirm("Negative learned file already exists. Overwrite?")){
-    write_csv(env$negative, LEARNED.NEGATIVE)
+    neg <- data.frame(cmb=env$negative.cmb, cnt=env$negative.cnt)
+    write_csv(neg, LEARNED.NEGATIVE)
   }
   
 }
 
 learn.load <- function(){
-  env <- parent.frame()
+  env <- .GlobalEnv
   
   if(file.exists(LEARNED.POSITIVE) && file.exists(LEARNED.NEGATIVE)){
     if(console.confirm("I found out that I already learned something a while ago. Do you want to use that data?")){
-      env$positive <- read_csv(LEARNED.POSITIVE)
-      env$negative <- read_csv(LEARNED.NEGATIVE)
+      pos <- read_csv(LEARNED.POSITIVE)
+      neg <- read_csv(LEARNED.NEGATIVE)
+      
+      env$positive.cmb <- pos$cmb
+      env$positive.cnt <- pos$cnt
+      env$negative.cmb <- neg$cmb
+      env$negative.cnt <- neg$cnt
+      
       return(TRUE)
     }
   }
@@ -199,22 +220,28 @@ console.ask <- function(str, type="string"){
 }
 
 set.import <- function(fullPath){
+  suppressMessages(suppressWarnings(
   if(endsWith(fullPath, ".tsv")) return(read_delim(fullPath, "\t", escape_backslash = TRUE, escape_double = FALSE, trim_ws = TRUE))
   else if(endsWith(fullPath, ".csv")) return(read_csv(fullPath, trim_ws = TRUE))
+  ))
 }
 
 learn.teach <- function(){
   if(console.confirm("Do you want to train me so I can be better?")){
-    if(exists("positive") && exists("negative")){
+    if(exists("positive.cmb") && exists("positive.cnt") && exists("negative.cmb") && exists("negative.cnt")){
       cat("Hmmm... I already know someting.\n")
       if(!console.confirm("Do you want me to continue to learn? (Append learning skillset)")){
-        positive <- data.frame(cmb=character(), cnt=integer(), stringsAsFactors = F)
-        negative <- data.frame(cmb=character(), cnt=integer(), stringsAsFactors = F)
+        positive.cmb <- c()
+        positive.cnt <- c()
+        negative.cmb <- c()
+        negative.cnt <- c()
       }
     }
     else{
-      positive <- data.frame(cmb=character(), cnt=integer(), stringsAsFactors = F)
-      negative <- data.frame(cmb=character(), cnt=integer(), stringsAsFactors = F)
+      positive.cmb <- c()
+      positive.cnt <- c()
+      negative.cmb <- c()
+      negative.cnt <- c()
     }
     
     set <- file.choose()

+ 103 - 0
threaded.R

@@ -59,4 +59,107 @@ sentiment.test.threaded <- function(){
   }
   
   return(score)
+}
+
+# Interactive, multi-process variant of the training dialogue.
+#
+# Asks whether to train, optionally resets the learned vectors kept in the
+# global environment (positive.cmb / positive.cnt / negative.cmb /
+# negative.cnt), lets the user pick a data set and a row range, trains on
+# PROCESSES workers through mcmapply() and folds every worker's tally back
+# into the global vectors before offering to save them. The raw per-worker
+# results are also kept in the global `answer`.
+#
+# The imported set is read through its `sentiment` and `review` columns.
+# Called for its side effects; the return value is not meaningful.
+learn.teach.threaded <- function(){
+  env <- .GlobalEnv
+
+  # Fold one worker's tally (add.cmb/add.cnt) into a running tally (cmb/cnt).
+  merge.tally <- function(cmb, cnt, add.cmb, add.cnt){
+    if(length(add.cmb) == 0) return(list(cmb=cmb, cnt=cnt))
+    hit <- match(add.cmb, cmb)
+    known <- !is.na(hit)
+    if(any(known)) cnt[hit[known]] <- cnt[hit[known]] + add.cnt[known]
+    list(cmb=c(cmb, add.cmb[!known]), cnt=c(cnt, add.cnt[!known]))
+  }
+
+  if(console.confirm("Do you want to train me so I can be better?")){
+    if(exists("positive.cmb") && exists("positive.cnt") && exists("negative.cmb") && exists("negative.cnt")){
+      cat("Hmmm... I already know someting.\n")
+      if(!console.confirm("Do you want me to continue to learn? (Append learning skillset)")){
+        env$positive.cmb <- c()
+        env$positive.cnt <- c()
+        env$negative.cmb <- c()
+        env$negative.cnt <- c()
+      }
+    }
+    else{
+      env$positive.cmb <- c()
+      env$positive.cnt <- c()
+      env$negative.cmb <- c()
+      env$negative.cnt <- c()
+    }
+
+    set <- file.choose()
+
+    set <- set.import(set)
+
+    MAX <- nrow(set)
+    MIN <- 1
+
+    if(!console.confirm(paste("Do you want me to learn all of the", as.character(nrow(set)), "records?"))){
+      cat("Well thanks that might just saved me a huge headache.\n")
+      repeat{
+        MIN <- console.ask("So where do you want me to start?", type = "integer")
+        if(MIN > 0 && MIN <= MAX) break
+        else cat("Please enter a number bigger than 0 and smaller or equal than ", as.character(MAX), "\n")
+      }
+      repeat{
+        MAX <- console.ask("And where do you want me to stop?", type = "integer")
+        if(MAX >= MIN && MAX <= nrow(set)) break
+        else cat(paste("Please enter a number bigger or equal then", as.character(MIN), "and smaller or equal then", as.character(nrow(set))))
+      }
+    }
+
+    # Rows handed to each worker; the last worker also takes the remainder.
+    span <- as.integer((MAX - MIN + 1) / PROCESSES)
+
+    worker <- function(x){
+      # Slices are offset by MIN so the requested start row is honoured.
+      first <- MIN + (x - 1) * span
+      last <- if(x < PROCESSES) first + span - 1 else MAX
+      # With fewer rows than workers the slice is empty, never first:(first-1).
+      rows <- if(last >= first) first:last else integer(0)
+
+      pos.cmb <- c()
+      pos.cnt <- c()
+      neg.cmb <- c()
+      neg.cnt <- c()
+
+      for(i in rows){
+        if(set[i,]$sentiment == 1){
+          pos <- sentiment.train.threaded(set[i,]$review, pos.cmb, pos.cnt)
+          pos.cmb <- pos$cmb
+          pos.cnt <- pos$cnt
+        }
+        else{
+          neg <- sentiment.train.threaded(set[i,]$review, neg.cmb, neg.cnt)
+          neg.cmb <- neg$cmb
+          neg.cnt <- neg$cnt
+        }
+      }
+      list(pos.cmb=pos.cmb, pos.cnt=pos.cnt, neg.cmb=neg.cmb, neg.cnt=neg.cnt)
+    }
+
+    cat("Getting smarter... (No progressbar will be shown, be patient)\n")
+
+    # SIMPLIFY = FALSE keeps one plain list per worker instead of a matrix.
+    env$answer <- mcmapply(worker, seq_len(PROCESSES), SIMPLIFY = FALSE)
+
+    # Check every result before touching the learned data, so a failed worker
+    # cannot leave the global tallies half merged.
+    if(!all(vapply(env$answer, is.list, logical(1)))){
+      stop("At least one training worker failed; nothing was learned.", call. = FALSE)
+    }
+
+    for(res in env$answer){
+      pos <- merge.tally(env$positive.cmb, env$positive.cnt, res$pos.cmb, res$pos.cnt)
+      env$positive.cmb <- pos$cmb
+      env$positive.cnt <- pos$cnt
+
+      neg <- merge.tally(env$negative.cmb, env$negative.cnt, res$neg.cmb, res$neg.cnt)
+      env$negative.cmb <- neg$cmb
+      env$negative.cnt <- neg$cnt
+    }
+
+    if(console.confirm("Let me catch some breath here. Do you want me to remeber this training?")) learn.save()
+  }
+  cat("Now that I know everything. There is one thing you should learn.\n")
+  cat("If you want me to analyse a review just call:\n\n")
+  cat("sentiment.calc(<any text>)\n\n")
+  cat("Now let's get started!\n")
+}
+
+# Count every SENSITIVITY-word window of `str` into a running tally.
+#
+# str: text to learn from; it is upper-cased and passed to sentiment.split().
+# cmb: word groups seen so far (may be empty).
+# cnt: occurrence counts, parallel to `cmb`.
+#
+# Returns list(cmb=, cnt=) holding the updated tally. When the split text has
+# fewer than SENSITIVITY elements a notice is printed and the tally is
+# returned unchanged.
+sentiment.train.threaded <- function(str, cmb, cnt){
+  tokens <- sentiment.split(toupper(str))
+  windows <- length(tokens) - SENSITIVITY + 1
+
+  if(windows < 1){
+    cat("You have to train me with more text.\n")
+    return(list(cmb=cmb, cnt=cnt))
+  }
+
+  for(start in seq_len(windows)){
+    gram <- paste(tokens[start:(start + SENSITIVITY - 1)], collapse = ' ')
+    hit <- match(gram, cmb)
+    if(is.na(hit)){
+      # First sighting: append the group with a count of one.
+      cmb <- c(cmb, gram)
+      cnt <- c(cnt, 1)
+    }
+    else{
+      cnt[hit] <- cnt[hit] + 1
+    }
+  }
+
+  list(cmb=cmb, cnt=cnt)
 }

Некоторые файлы не были показаны из-за большого количества измененных файлов