Attachment @ IMDB open data movies parsing file_download
2017-01-31
2017
01-31
«imdb_movies_dump.rb»
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
#!/usr/bin/env ruby

require 'stringio'
require 'zlib'

class String
  # Gzip-compresses the receiver entirely in memory.
  #
  # http://ruby-doc.org/stdlib-1.9.3/libdoc/zlib/rdoc/Zlib/GzipWriter.html
  # https://gist.github.com/sinisterchipmunk/1335041
  #
  # @param fname [String] name stored in the gzip header's original-name field
  # @return [String] the gzip stream, tagged as binary (ASCII-8BIT)
  def gzipped(fname = 'file.txt')
    # Use a binary buffer so the compressed bytes are not labelled with the
    # default external (text) encoding.
    gz_file = StringIO.new(''.b)

    # The block form closes the writer (which emits the gzip trailer) even
    # when a write raises.
    Zlib::GzipWriter.wrap(gz_file) do |z|
      z.mtime     = Time.now
      z.orig_name = fname
      z.write self
    end

    gz_file.string
  end # gzipped ----------------------------------------------------------------

  # Decompresses the receiver, which must contain a gzip stream.
  #
  # @return [String] the decompressed data
  # @raise [Zlib::GzipFile::Error] if the receiver is not in gzip format
  def gunzipped
    # The block form closes the reader so its zlib stream is released right
    # away instead of waiting for garbage collection.
    Zlib::GzipReader.wrap(StringIO.new(self), &:read)
  end # gunzipped --------------------------------------------------------------
end

# Read the gzipped movie list, inflate it in memory, tag the text as
# ISO-8859-1 and split it into one string per line.
# File.binread reads the whole file in binary mode and closes the handle
# itself, so no descriptor is left open.
# The trailing `; nil` has no effect in a script; presumably it keeps IRB from
# echoing the huge array when the line is pasted interactively.
movies = File.binread('movies.list.gz').gunzipped.force_encoding("iso-8859-1").split("\n"); nil # ~200MB of data

# Pattern applied to each line of the movie list: a title key, one or more
# tabs, then the remaining text of the line (the year column).
# captures (indices are positions in MatchData#captures):
#   0: #TITLE (UNIQUE KEY)                 everything before the tab(s)
#   1: (.*? \(\S{4,}\))                    movie name + year
#   2: (\(\S+\))                           type ex:(TV)
#   3: (\{(.*?) ?(\(\S+?\))?\})            series info ex: {Ally Abroad (#3.1)}
#   4: (.*?)                               episode name ex: Ally Abroad
#   5: (\(\S+?\))                          episode number ex: (#3.1)
#   6: (\{\{SUSPENDED\}\})                 is suspended?
#   7: (.*)                                year (all text after the tabs)
# The (?!...) lookahead stops a {{SUSPENDED}} marker from being consumed as
# series info, so it can only land in capture 6.
re = /((.*? \(\S{4,}\)) ?(\(\S+\))? ?(?!\{\{SUSPENDED\}\})(\{(.*?) ?(\(\S+?\))?\})? ?(\{\{SUSPENDED\}\})?)\t+(.*)$/

# Build the sorted "year<TAB>name" report for entries dated 2010..2016 and
# print it. Lines that do not match `re` are ignored.
last_movie = nil # name of the most recently kept entry
data = []
movies.each do |line|
  matched = line.match(re)
  next if matched.nil?

  groups = matched.captures
  # Drop every double quote and a trailing 4-character "(....)" year suffix.
  name = groups[1].delete('"').sub(/ *\(\S{4}\)$/,'')
  # Reduce an "xxxx-yyyy" range to "xxxx".
  year = groups[7].sub(/-\S{4}/,'')

  # Skip unknown years, years outside the window, and a name identical to
  # the previously kept one (consecutive repeats only).
  next if year.include?('?') || !(2010..2016).include?(year.to_i) || last_movie == name

  last_movie = name
  data << "#{year}\t#{name}"
end
data = data.sort; nil

puts data