main.R 6.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250
  1. # (c) Deben Oldert
  2. # Checks the difference between multiple movie rating from different websites
  3. # Load libraries
  4. suppressMessages(library(dplyr))
  5. suppressMessages(library(stringr))
  # Directory the script was started from; dataset and output paths are
  # resolved against it
  wd <- getwd()
  # Resolve a path relative to the start directory.
  #   pth: character vector of relative paths, e.g. "datasets/imdb/imdb.csv"
  # Returns `wd` and `pth` joined with a "/" separator.
  returnPath <- function(pth) {
    paste(wd, pth, sep = "/")
  }
  # Display names of the three rating providers. The order is significant:
  # it is shared with the plot colours, the legend and the bar labels.
  #   1. IMDB
  #   2. GroupLens
  #   3. Netflix
  set_names <- c("IMDB", "GroupLens", "Netflix")
  # Load the IMDB export; the first CSV column is used as row names
  print("Prepairing IMDB...")
  imdb <- read.csv(returnPath("datasets/imdb/imdb.csv"), row.names = 1)
  # as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.0
  imdb <- select(as_tibble(imdb), title, rating, year)
  # Rewrite every title as "Title (year)"
  imdb <- mutate(imdb, title = paste0(title, " (", year, ")"))
  print("DONE")
  print("Prepairing GroupLens...")
  groupLens_movie <- read.csv(returnPath("datasets/groupLens/movies.csv"))
  # as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.0
  groupLens_movie <- select(as_tibble(groupLens_movie), movieId, title)
  # Extract the release year from the parenthesised suffix of the title,
  # e.g. "(1995)". Anchoring on the trailing parentheses keeps digits that
  # belong to the name itself ("2001: A Space Odyssey (1968)") from being
  # mistaken for the year. Column 2 of str_match() is the capture group;
  # titles without such a suffix yield NA.
  groupLens_movie <- mutate(
    groupLens_movie,
    year = as.integer(str_match(title, "\\(([0-9]{4})[^()]*\\)\\s*$")[, 2])
  )
  # Collapse the individual ratings into one mean rating per movie
  groupLens_rating <- read.csv(returnPath("datasets/groupLens/ratings.csv"))
  groupLens_rating <- as_tibble(groupLens_rating) %>%
    group_by(movieId) %>%
    summarise(rating = mean(rating, na.rm = TRUE))
  # Full outer join on the only shared column, so movies without ratings
  # (and ratings without a movie entry) are both kept
  groupLens <- merge(groupLens_movie, groupLens_rating, by = "movieId", all = TRUE)
  # Cleanup
  remove(groupLens_rating)
  remove(groupLens_movie)
  print("DONE")
  print("Prepairing Netflix...")
  # Heavy step: about 2 GB of data (roughly 100 million reviews), stored as
  # one ratings CSV per movie
  netflix_movie <- read.csv(returnPath("datasets/netflix/movie_titles.csv"))
  # as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.0
  netflix_movie <- select(as_tibble(netflix_movie), movieId, title, year)
  netflix_movie <- mutate(netflix_movie, title_frmt = paste0(title, " (", year, ")"))
  # Empty frame that pins the column layout of the final result
  netflix <- data.frame(title_frmt = character(0), year = integer(0), rating = numeric(0))
  # Average the ratings of every movie. Each movie contributes a single-row
  # data frame (or NULL when it is skipped); the pieces are bound together once
  # at the end instead of re-copying `netflix` on every iteration.
  netflix_rows <- lapply(seq_len(nrow(netflix_movie)), function(i) {
    movie <- netflix_movie[i, ]
    print(i)
    # Rating files are named mv_<zero-padded 7-digit id>.csv, e.g.:
    # mv_0000001.csv
    # mv_0027640.csv
    movie_id <- as.integer(as.character(movie$movieId))
    csv_ <- paste(wd, "datasets/netflix", sprintf("mv_%07d.csv", movie_id), sep = "/")
    # read.csv() raises an error on a zero-byte file rather than returning
    # NULL, so emptiness is checked on the file size first. A missing file
    # gives NA here and is left for read.csv() to report.
    if (isTRUE(file.size(csv_) == 0)) {
      print(paste("Empty:", csv_))
      return(NULL)
    }
    ratings <- read.csv(csv_)
    if (nrow(ratings) == 0) { # Header only => nothing to average
      print(paste("Empty:", csv_))
      return(NULL)
    }
    data.frame(
      title_frmt = as.character(movie$title_frmt),
      year = as.integer(as.character(movie$year)),
      rating = mean(ratings[["rating"]], na.rm = TRUE)
    )
  })
  # bind_rows() ignores the NULL entries of skipped movies
  netflix <- bind_rows(c(list(netflix), netflix_rows))
  # Cleanup
  remove(netflix_rows)
  remove(netflix_movie)
  print("DONE")
  print("Done Loading")
  print("Working on question no. 1...")
  # Plot ranges: `y` is the upper bound of the rating axis, the year bounds
  # span all three data sets
  y <- 10
  x_min <- min(
    min(imdb$year, na.rm = TRUE),
    min(groupLens$year, na.rm = TRUE),
    min(netflix$year, na.rm = TRUE)
  )
  x_max <- max(
    max(imdb$year, na.rm = TRUE),
    max(groupLens$year, na.rm = TRUE),
    max(netflix$year, na.rm = TRUE)
  )
  # One colour per provider
  color <- rainbow(3)
  # Mean rating per release year for each provider; the GroupLens and Netflix
  # means are doubled before being compared with IMDB
  imdb_year_avg <- summarise(
    group_by(imdb, year),
    rating = mean(rating, na.rm = TRUE)
  )
  groupLens_year_avg <- summarise(
    group_by(groupLens, year),
    rating = mean(rating, na.rm = TRUE) * 2
  )
  netflix_year_avg <- summarise(
    group_by(netflix, year),
    rating = mean(rating, na.rm = TRUE) * 2
  )
  # Join the three tables on year (only years present in all of them remain)
  # and add the mean over the providers. Rating columns after the joins:
  #   rating.x => imdb
  #   rating.y => groupLens
  #   rating   => netflix
  yearList <- imdb_year_avg %>%
    merge(groupLens_year_avg, by = "year") %>%
    merge(netflix_year_avg, by = "year") %>%
    mutate(mean = ((rating + rating.x + rating.y) / 3))
  # Question 1: draw the per-year average rating of each provider into
  # output/Q1.png and mark the year with the highest overall mean
  png(filename = returnPath("output/Q1.png"), height = 400, width = 900, bg = "white")
  # IMDB line; the x axis is the row index of yearList, the axes are drawn
  # by hand below
  plot(yearList$rating.x,
    type = "l",
    ylim = c(0, y),
    col = color[1],
    axes = FALSE,
    ann = TRUE,
    xlab = "Years",
    ylab = "Avg. rating",
    cex.lab = 0.8,
    lwd = 2,
    main = "In what movie release year where the average ratings the highest?"
  )
  # Label the row indices with the actual years; seq_along() stays empty when
  # yearList has no rows
  axis(1, at = seq_along(yearList$year), labels = yearList$year, pos = 0)
  # Ticks every 2 rating points within the plotted 0..y range
  axis(2, las = 1, at = seq(0, y, by = 2), pos = 1)
  # GroupLens line
  lines(yearList$rating.y, type = "l", lty = 2, col = color[2], lwd = 2)
  # Netflix line
  lines(yearList$rating, type = "l", lty = 3, col = color[3], lwd = 2)
  # lines(yearList$mean,
  # type = "l",
  # lty = 4,
  # col = "yellow",
  # lwd = 2
  # )
  # Year with the highest mean over the three providers
  sorted <- arrange(yearList, desc(mean))
  highest <- sorted[1, ]
  highest$mean <- round(highest$mean, digits = 3)
  # x position of that year = its row index in yearList, which is what the
  # lines above are plotted against
  highest_x <- match(highest$year, yearList$year)
  points(highest_x, highest$mean, pch = 8, lwd = 2, cex = 2)
  text(highest_x - 0.3, highest$mean + 0.7, labels = highest$mean)
  text(
    50,
    10,
    labels = paste(
      "The highest average rating was in:",
      highest$year
    )
  )
  legend(1, 3, set_names, cex = 0.8, col = color, lty = 1:3, lwd = 2, bty = "n")
  dev.off()
  print("In what year are the ratings the highest?")
  print(paste("That was in:", highest$year, "Score:", highest$mean))
  # Cleanup
  remove(sorted)
  remove(highest)
  remove(highest_x)
  # Question 2: compare the providers by the mean of their yearly averages
  # and draw the result as a bar chart into output/Q2.png
  print("Working on question no. 2...")
  netflix_ <- round(mean(yearList$rating), digits = 3)
  imdb_ <- round(mean(yearList$rating.x), digits = 3)
  groupLens_ <- round(mean(yearList$rating.y), digits = 3)
  # Same order as set_names / color
  vct <- c(imdb_, groupLens_, netflix_)
  png(filename = returnPath("output/Q2.png"), height = 500, width = 450, bg = "white")
  # barplot() returns the x coordinates of the bar midpoints; they are used to
  # place the value labels instead of hard-coding the default positions
  bar_mids <- barplot(
    vct,
    col = color,
    names.arg = set_names,
    ylim = c(0, y),
    axes = TRUE,
    xlab = "Data Provider",
    ylab = "Avg. score",
    main = "Which provider has the highest avrerage score?"
  )
  # Value label just above each bar, in the bar's colour
  text(bar_mids, vct + 0.2, labels = vct, col = color)
  text(1.5, y - 1, labels = paste("Provider with hightest average score is:", set_names[which.max(vct)]))
  dev.off()
  # Cleanup
  remove(netflix_)
  remove(groupLens_)
  remove(imdb_)
  remove(vct)
  remove(bar_mids)