Attachment @ IMDB open data movies parsing file_download
2017-01-31
«imdb_movies_dump.rb»
#!/usr/bin/env ruby
# Prints "year<TAB>title" lines for the titles listed in IMDb's plain-text
# movies.list.gz dump, restricted to a range of years (2010..2016 by default).
#
# Usage: imdb_movies_dump.rb [path/to/movies.list.gz]

require 'stringio'
require 'zlib'

# In-memory gzip helpers added to String.
class String
  # http://ruby-doc.org/stdlib-1.9.3/libdoc/zlib/rdoc/Zlib/GzipWriter.html
  # https://gist.github.com/sinisterchipmunk/1335041
  #
  # Returns this string compressed as a gzip stream.
  #
  # fname is recorded in the gzip header as the original file name. The header
  # also carries the current time, so two calls do not produce identical bytes.
  def gzipped(fname = 'file.txt')
    gz_file = StringIO.new
    # wrap closes the writer when the block exits (even on error); closing is
    # what flushes the gzip trailer into gz_file.
    Zlib::GzipWriter.wrap(gz_file) do |z|
      z.mtime = Time.now
      z.orig_name = fname
      z.write self
    end
    gz_file.string
  end # gzipped ----------------------------------------------------------------

  # Returns the decompressed content of this string, which must hold a gzip
  # stream (Zlib raises Zlib::GzipFile::Error otherwise).
  def gunzipped
    # Block form so the reader is closed instead of being left to the GC.
    Zlib::GzipReader.wrap(StringIO.new(self), &:read)
  end # gunzipped --------------------------------------------------------------
end

# File read when no path is given on the command line.
DEFAULT_MOVIE_LIST = 'movies.list.gz'

# Years kept when the caller does not pass its own range.
DEFAULT_YEARS = (2010..2016)

# One entry line of movies.list. Capture groups:
#   1: whole title (unique key)
#   2: (.*? \(\S{4,}\))          movie name + year
#   3: (\(\S+\))                 type                ex: (TV)
#   4: (\{(.*?) ?(\(\S+?\))?\})  series info         ex: {Ally Abroad (#3.1)}
#   5: (.*?)                     episode name        ex: Ally Abroad
#   6: (\(\S+?\))                episode number      ex: (#3.1)
#   7: (\{\{SUSPENDED\}\})       is suspended?
#   8: (.*)                      year column (after the tabs)
MOVIE_LINE_RE = /((.*? \(\S{4,}\)) ?(\(\S+\))? ?(?!\{\{SUSPENDED\}\})(\{(.*?) ?(\(\S+?\))?\})? ?(\{\{SUSPENDED\}\})?)\t+(.*)$/.freeze

# Reads the gzipped list at +path+ and returns its lines (without "\n"),
# tagged as ISO-8859-1. The whole file is decompressed in memory
# (~200MB of data for the full IMDb list).
def read_movie_lines(path)
  # binread opens, reads and closes the file in one call (no leaked handle).
  File.binread(path).gunzipped.force_encoding('iso-8859-1').split("\n")
end

# Turns raw movies.list lines into a sorted array of "year\tname" strings.
#
# lines - Enumerable of Strings; lines that do not match MOVIE_LINE_RE are
#         ignored.
# years - object responding to include?(Integer); only entries whose year
#         column falls inside it are kept.
#
# Consecutive entries that share the same "name (year)" title (for example the
# episodes of one series) are reported once. The comparison keeps the year, so
# adjacent same-named titles from different years are all reported.
def extract_movies(lines, years: DEFAULT_YEARS)
  last_title = nil
  rows = lines.map do |line|
    next unless (m = MOVIE_LINE_RE.match(line))

    year = m[8].sub(/-\S{4}/, '') # remove yyyy from xxxx-yyyy
    next if year.include?('?')
    next unless years.include?(year.to_i)

    title = m[2].delete('"') # series titles are wrapped in quotes
    next if last_title == title

    last_title = title
    "#{year}\t#{title.sub(/ *\(\S{4}\)$/, '')}" # drop the trailing (yyyy)
  end
  rows.compact.sort
end

if __FILE__ == $PROGRAM_NAME
  list_path = ARGV.first || DEFAULT_MOVIE_LIST
  if ARGV.empty? && !File.exist?(list_path)
    # No input given and no default file here: explain instead of a backtrace.
    # An explicitly given path that is missing still raises Errno::ENOENT.
    warn "#{DEFAULT_MOVIE_LIST} not found; usage: #{File.basename($PROGRAM_NAME)} [path/to/movies.list.gz]"
  else
    puts extract_movies(read_movie_lines(list_path))
  end
end