main.R 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377
  1. # (c) Deben Oldert
  2. # Compares the ratings that different websites give to the same movies
  3. # Load libraries
  4. suppressMessages(library(dplyr))
  5. suppressMessages(library(stringr))
  # Set global vars
  # Directory the script is started from; all data and output paths are
  # resolved relative to it.
  wd <- getwd()
  # Set common functions
  # Build a path below the working directory captured in `wd`.
  #
  # pth: path relative to `wd`, using "/" as separator
  #      (e.g. "datasets/imdb/imdb.csv").
  # Returns `wd` and `pth` joined with "/".
  returnPath <- function(pth) {
    file.path(wd, pth)
  }
  # Display names of the rating providers, in the order used by the graphs:
  # 1. IMDB
  # 2. GroupLens
  # 3. Netflix
  set_names <- c("IMDB", "GroupLens", "Netflix")
  # Loading .CSV's
  print("Prepairing IMDB...")
  # Keep only the columns that are used later on and append the release year
  # to the title ("<title> (<year>)"), the key on which the per-movie
  # comparison joins the providers.
  # as_tibble() replaces tbl_df(), which is deprecated in dplyr.
  imdb <- read.csv(returnPath("datasets/imdb/imdb.csv"), row.names = 1) %>%
    as_tibble() %>%
    select(title, rating, year) %>%
    mutate(title = paste0(title, " (", year, ")"))
  print("DONE")
  print("Prepairing GroupLens...")
  # Movie list. The release year is taken from the parenthesised year at the
  # END of the title (capture group 1). Matching any 4-digit run instead would
  # pick up numbers that are part of the title itself, e.g. the "2001" in
  # "2001: A Space Odyssey (1968)". Titles without such a suffix get NA.
  groupLens_movie <- read.csv(returnPath("datasets/groupLens/movies.csv")) %>%
    as_tibble() %>%
    select(movieId, title) %>%
    mutate(
      year = as.integer(
        str_match(title, "\\(([0-9]{4})[^()]*\\)\\s*$")[, 2]
      )
    )
  # Average rating per movie
  groupLens_rating <- read.csv(returnPath("datasets/groupLens/ratings.csv")) %>%
    as_tibble() %>%
    group_by(movieId) %>%
    summarise(rating = mean(rating, na.rm = TRUE))
  # Full outer join on the only shared column, so movies without ratings (and
  # ratings without a movie entry) are kept with NA in the missing fields.
  groupLens <- merge(
    groupLens_movie,
    groupLens_rating,
    by = "movieId",
    all = TRUE
  )
  # Cleanup
  rm(groupLens_rating, groupLens_movie)
  print("DONE")
  print("Prepairing Netflix...")
  # Heavy step: the original author notes roughly 2 GB of data
  # (about 100 million reviews), stored as one CSV file per movie.
  netflix_movie <- read.csv(returnPath("datasets/netflix/movie_titles.csv")) %>%
    as_tibble() %>%
    select(movieId, title, year) %>%
    mutate(title_frmt = paste0(title, " (", year, ")"))
  # Build the final table: one row per movie with its formatted title, release
  # year and average rating. Everything temporary lives inside local() so no
  # loop variables leak into the global environment.
  netflix <- local({
    n_movies <- nrow(netflix_movie)
    # Preallocate instead of growing a data frame row by row
    avg_rating <- rep(NA_real_, n_movies)
    has_rating <- rep(FALSE, n_movies)
    shown_percent <- 0L
    for (i in seq_len(n_movies)) {
      # Since this takes a long time we print the percentage so you know
      # it's still running
      percent <- as.integer((i / n_movies) * 100)
      if (percent > shown_percent) {
        print(paste0(percent, "%"))
        shown_percent <- percent
      }
      # Rating files are named after the movie id, zero-padded to 7 digits.
      # E.G:
      # mv_0000001.csv
      # mv_0027640.csv
      csv_ <- returnPath(file.path(
        "datasets/netflix",
        sprintf("mv_%07d.csv", as.integer(netflix_movie$movieId[i]))
      ))
      # read.csv() raises an error on a zero-byte file instead of returning
      # NULL, so the empty case has to be detected before reading. A missing
      # file (size NA) still fails in read.csv() below.
      if (isTRUE(file.size(csv_) == 0)) {
        print(paste("Empty:", csv_))
        next
      }
      ratings <- read.csv(csv_)
      # Header without any rating rows => skip it as well
      if (nrow(ratings) == 0) {
        print(paste("Empty:", csv_))
        next
      }
      avg_rating[i] <- mean(ratings$rating, na.rm = TRUE)
      has_rating[i] <- TRUE
    }
    data.frame(
      title = as.character(netflix_movie$title_frmt[has_rating]),
      # Year values that are not numeric text become NA; only that coercion
      # warning is silenced.
      year = suppressWarnings(
        as.integer(as.character(netflix_movie$year[has_rating]))
      ),
      rating = avg_rating[has_rating],
      stringsAsFactors = FALSE
    )
  })
  # Cleanup
  rm(netflix_movie)
  print("DONE")
  print("Done Loading")
  print("Working on question no. 1...")
  # Define ranges
  # Upper bound of the rating axis
  y <- 10
  # Earliest and latest release year found in any of the three datasets
  year_cols <- list(imdb$year, groupLens$year, netflix$year)
  x_min <- do.call(min, lapply(year_cols, min, na.rm = TRUE))
  x_max <- do.call(max, lapply(year_cols, max, na.rm = TRUE))
  rm(year_cols)
  # One colour per provider
  color <- rainbow(3)
  # Average score per release year for every provider.
  # The GroupLens and Netflix averages are doubled so that all three end up on
  # the same 10-point scale; IMDB is used as-is.
  imdb_year_avg <- summarise(
    group_by(imdb, year),
    rating = mean(rating, na.rm = TRUE)
  )
  groupLens_year_avg <- summarise(
    group_by(groupLens, year),
    rating = mean(rating, na.rm = TRUE) * 2
  )
  netflix_year_avg <- summarise(
    group_by(netflix, year),
    rating = mean(rating, na.rm = TRUE) * 2
  )
  # Inner-join the three tables on year, so only years for which every
  # provider has data remain. Resulting column names:
  # imdb => rating.x
  # groupLens => rating.y
  # netflix => rating
  yearList <- Reduce(
    function(left, right) merge(left, right, by = "year"),
    list(imdb_year_avg, groupLens_year_avg, netflix_year_avg)
  )
  # Combined score: plain mean of the three provider averages
  yearList$mean <- (yearList$rating + yearList$rating.x + yearList$rating.y) / 3
  # Start the image output for question 1
  png(
    filename = returnPath("output/Q1.png"),
    height = 400,
    width = 900,
    bg = "white"
  )
  # Create a line graph, starting with the IMDB averages (rating.x).
  # The x values are row positions in yearList, not the years themselves; the
  # years are put on the axis as labels below.
  plot(
    yearList$rating.x,
    type = "l",
    ylim = c(0, y),
    col = color[1],
    axes = FALSE,
    ann = TRUE,
    xlab = "Years",
    ylab = "Avg. rating",
    cex.lab = 0.8,
    lwd = 2,
    main = "In what movie release year where the average ratings the highest?"
  )
  # Format the X and Y axis: one tick per row labelled with its year, and a
  # rating tick every 2 points
  axis(1, at = seq_along(yearList$year), labels = yearList$year, pos = 0)
  axis(2, las = 1, at = 2 * 0:y, pos = 1)
  # Add the other lines: GroupLens (rating.y, dashed) and Netflix (rating,
  # dotted). No plotting symbol is given because type "l" draws none.
  lines(yearList$rating.y, type = "l", lty = 2, col = color[2], lwd = 2)
  lines(yearList$rating, type = "l", lty = 3, col = color[3], lwd = 2)
  # This is the mean line (average of all 3 providers)
  # Uncomment below to see the result as a 4th line
  # lines(yearList$mean,
  #   type = "l",
  #   lty = 4,
  #   col = "yellow",
  #   lwd = 2
  # )
  # Sort the data frame descending on the mean column (highest mean on top)
  # and take the first row, i.e. the year with the highest combined rating
  highest <- arrange(yearList, desc(mean))[1, ]
  # Round the mean to 3 digits
  highest$mean <- round(highest$mean, digits = 3)
  # The graph is drawn against row positions in yearList, so the marker's x
  # coordinate is the position of the winning year in that table.
  highest_x <- match(highest$year, yearList$year)
  # Add point in the line graph to show the point with the highest rating
  points(highest_x, highest$mean, pch = 8, lwd = 2, cex = 2)
  # Display the rating above the point we just created
  text(highest_x - 0.3, highest$mean + 0.7, labels = highest$mean)
  # Answer the question
  text(
    50,
    10,
    labels = paste("The highest average rating was in:", highest$year)
  )
  # Draw the legend; line types 1:3 match the IMDB, GroupLens and Netflix lines
  legend(
    1,
    3,
    set_names,
    cex = 0.8,
    col = color,
    lty = 1:3,
    lwd = 2,
    bty = "n"
  )
  # Save the image. invisible() keeps the device number that dev.off() returns
  # from being printed to the console.
  invisible(dev.off())
  # Print question + answer to console
  print("In what year are the ratings the highest?")
  print(paste("That was in:", highest$year, "Score:", highest$mean))
  # Cleanup
  rm(highest, highest_x)
  # Do question no. 2
  print("Working on question no. 2...")
  # Mean of the yearly averages per provider, rounded to 3 digits, in the same
  # order as set_names (IMDB, GroupLens, Netflix)
  vct <- round(
    c(
      mean(yearList$rating.x),
      mean(yearList$rating.y),
      mean(yearList$rating)
    ),
    digits = 3
  )
  best_provider <- set_names[which.max(vct)]
  # Start image output for question 2
  png(
    filename = returnPath("output/Q2.png"),
    height = 500,
    width = 450,
    bg = "white"
  )
  # Create a new bar graph; barplot() returns the x midpoint of every bar
  bar_mid <- barplot(
    vct,
    col = color,
    names.arg = set_names,
    ylim = c(0, y),
    axes = TRUE,
    xlab = "Data Provider",
    ylab = "Avg. score",
    main = "Which provider has the highest avrerage score?"
  )
  # Print the mean rating for each provider just above its bar
  text(bar_mid, vct + 0.2, labels = vct, col = color)
  # Answer the question
  text(
    1.5,
    y - 1,
    labels = paste("Provider with hightest average score is:", best_provider)
  )
  # Save the image. invisible() keeps the device number that dev.off() returns
  # from being printed to the console.
  invisible(dev.off())
  # Print the question + answer
  print("Which provider has the highest avrerage score?")
  print(paste("Provider with hightest average score is:", best_provider))
  # Cleanup
  rm(vct, bar_mid, best_provider)
  print("Working on question 3...")
  # Join the three providers on the formatted title. After the joins:
  # imdb => rating.x
  # groupLens => rating.y
  # netflix => rating
  movieList <- merge(imdb, groupLens, by = "title")
  movieList <- merge(movieList, netflix, by = "title")
  movieList <- select(movieList, title, rating.x, rating.y, rating)
  # Double the GroupLens and Netflix scores to bring them to the 10-point scale
  movieList <- mutate(movieList, rating.y = rating.y * 2, rating = rating * 2)
  # Order by Netflix rating, best first
  movieList <- arrange(movieList, desc(rating))
  # Get the top 5 of the list (fewer if fewer movies matched; head() never
  # pads with NA rows)
  top5 <- head(movieList, 5)
  # Score matrix: one column per movie, one row per provider. The row order
  # (Netflix, IMDB, GroupLens) determines the bar order inside each group and
  # therefore which colour each provider gets, so the legend labels are
  # derived from the same order.
  bar_providers <- set_names[c(3, 1, 2)]
  mrx5 <- rbind(top5$rating, top5$rating.x, top5$rating.y)
  colnames(mrx5) <- as.character(top5$title)
  # Start image output
  png(
    filename = returnPath("output/Q3.png"),
    height = 600,
    width = 600,
    bg = "white"
  )
  # Plot the graph without group labels; the titles are added separately so
  # they can be rotated. barplot() returns the x midpoint of every bar.
  bar_mid <- barplot(
    mrx5,
    beside = TRUE,
    col = color,
    ylim = c(0, 10),
    names.arg = rep("", ncol(mrx5)),
    main = "How much differce the top 5 of Netflix with other providers?",
    ylab = "Score"
  )
  # Add movie titles below the graph, centred under each group of bars
  text(
    colMeans(bar_mid),
    par("usr")[3] - 0.3,
    srt = 20,
    adj = 1,
    labels = colnames(mrx5),
    xpd = TRUE,
    cex = 0.6
  )
  # Add legend to graph
  legend(6, 10, bar_providers, cex = 0.8, fill = color, bty = "n")
  # Save the image. invisible() keeps the device number that dev.off() returns
  # from being printed to the console.
  invisible(dev.off())
  # Cleanup
  rm(mrx5, top5, bar_mid, bar_providers)
  print("The answer of question 3 is in graph Q3")
  print("You can find the graphs in the output folder.")