Convert NetFlix Prize Data to CSV
Tuesday, October 3rd, 2006

Here is a simple Ruby script to convert the NetFlix Prize training data files into a single denormalized CSV file.
# Converts the NetFlix Prize training data into one denormalized CSV file.
#
# Inputs (relative to the current directory):
#   movie_titles.txt         - one "id,year,title" line per movie
#   training_set/mv_*.txt    - one file per movie: the first line carries the
#                              movie id, every following line is one rating
# Output:
#   ratings.txt              - one CSV row per rating:
#                              the rating's first three fields, movie id,
#                              year, title
require 'csv'

# make a movie lookup table: movie id => [year, title]
#
# NOTE(review): the titles file is assumed to be ISO-8859-1 and is transcoded
# to UTF-8 on read so accented titles do not raise "invalid byte sequence"
# errors later on - confirm against your copy of the data set.
movies = {}
File.open('movie_titles.txt', 'r:ISO-8859-1:UTF-8') do |f|
  f.each_line do |line|
    # A limit of 3 keeps any commas that are part of the title itself.
    id, year, title = line.chomp.split(',', 3)
    movies[id.to_i] = [year, title]
  end
end

# read all the ratings files and denormalize into one csv file
#
# The block forms of CSV.open / File.open close (and flush) the handles even
# when an exception is raised part-way through.
CSV.open('ratings.txt', 'w') do |out|
  # Sort so the output order does not depend on directory listing order.
  Dir['training_set/mv_*.txt'].sort.each do |file|
    File.open(file, 'r') do |f|
      # first line is the movie id
      movie_id = f.gets.to_i
      year, title = movies[movie_id]
      printf "%5d - %s\n", movie_id, title

      f.each_line do |line|
        line = line.chomp
        next if line.empty?

        # Build a fresh row for every rating; a short or malformed line can
        # then never shift the movie columns of the rows that follow it.
        first, second, third = line.split(',')
        out << [first, second, third, movie_id, year, title]
      end
    end
  end
end