#!/usr/bin/ruby
#TODO: if it's a dir add / at the end ?
# that would make sense for cache. now pmatch test and pmatch test/
# will yield different hashes
#TODO: handle hard and soft links. how to handle hardlinks? take first one and ignore the rest?
#TODO: how should SC deep & shallow really work ?!
#Windows port:
# user home directory
# use File.join
#TODO: warn if last mod time in the future

#standard
require 'optparse'
require 'ostruct'
require 'singleton'
require 'pathname'
require 'digest/md5'

#optional
require 'pp'
#gems
require 'rubygems'
require 'log4r'

#clear cache if it contains more than CACHE_CLEAR_IF_FILES file
CACHE_CLEAR_IF_FILES=50
COMMENT='#'
PMATCH_VERSION='0.3.1'

#functions should remove > 1 result if results are equal
class SecondaryChoice
  attr_reader :methods

  def initialize(methods, hash={})
    @methods = []
    hash[:add_random] = true if hash[:add_random].nil? 
    random_included = !hash[:add_random]

    #check if all choices are valid
    methods.each do |method|
      if self.respond_to?(("choice_" + method + "!").to_sym)
        random_included = true if method == 'random'
        @methods << ("choice_" + method + "!").to_sym
      else  
        $l.warn "Method #{method} does not exist."        
      end
    end
    @methods << :choice_random! if !random_included 
  end
    
  #remove everything but the shortest string(s) from the array
  def choice_short!(list)
    return if list.length < 2
    shortest = list.min {|a,b| a.fname.length <=> b.fname.length }
    shortest_length = shortest.fname.length
    list.reject! {|l| l.fname.length > shortest_length }
  end #def choose

  #remove everything but the longest string(s) from the array
  def choice_long!(list)
    return if list.length < 2
    shortest = list.max {|a,b| a.fname.length <=> b.fname.length }
    shortest_length = shortest.fname.length
    list.reject! {|l| l.fname.length < shortest_length }
  end #def choose

  #leave only one, random element in the list
  def choice_random!(list)
    return if list.length < 2
    i = rand(list.length)
    $l.debug "Random index: #{i}"
    Stats.instance.sc_random_used += 1
    list.replace([list[i]])
  end

  #leave files with deepest path
  def choice_deep!(list)
    return if list.length < 2
    longest = list.max {|a,b| a.dir.level <=> b.dir.level }
    longest_length = longest.dir.level
    list.reject! {|l| l.dir.level < longest_length }
  end
    
  #leave files with most shallow path
  def choice_shallow!(list)
    return if list.length < 2
    shortest = list.min {|a,b| a.dir.level <=> b.dir.level }
    shortest_length = shortest.dir.level
    list.reject! {|l| l.dir.level > shortest_length }
  end

  #leave files that have most siblings in their directory
  def choice_dirfull!(list)
    return if list.length < 2
    longest = list.max {|a,b| a.dir.files_count <=> b.dir.files_count }
    longest_length = longest.dir.files_count
    list.reject! {|l| l.dir.files_count < longest_length }
  end

  #leave files that have least siblings in their directory
  def choice_dirempty!(list)
    return if list.length < 2
    shortest = list.min {|a,b| a.dir.files_count <=> b.dir.files_count }
    shortest_length = shortest.dir.files_count
    list.reject! {|l| l.dir.files_count > shortest_length }
  end
    
  def choose!(list)
    @methods.each do |method|
      break if list.length < 2
      $l.debug "Filtering using #{method}"
      send(method, list)
    end
  end
end #class SecondaryChoice

#responsible for computing hash of the file
class UniversalHash
  include Singleton

  def initialize
    #first of all check if path to md5path
    if md5sum_ok?(AppConfig.instance.md5path)
      @utility_path = AppConfig.instance.md5path
      @method = :external_md5
    else
      @method = :internal_md5
    end
    #   puts @method
    #    exit
  end
  
  def md5sum_ok? (md5sum)
    #try to run on yourself
    md5 = `md5sum #{escape(__FILE__)}`
    return false if $? != 0
    return false if md5[0..31] !~ /^[a-fA-F0-9]{32}/
    return true
  end
  
  #only this function is used
  #it will wrap 'real' hashing implementation
  def hash(path)
    #TODO: just one option for now
    external_md5(path)
  end
  
  def escape(path)
    newpath = path.gsub(/([\\'` ";\(\)&<>$#!\|\{\}])/,%(\\\\\\1))
    return newpath
  end
  
  def external_md5(path)
    command = "#{@utility_path} #{escape(path)}" 
    md5 = `#{command}`
    if $? != 0
      $l.error command
      raise "md5 command quit with status #{$?}"
    end
    #sanity check
    if md5.split("\n").length > 1
      $l.error command
      raise 'md5sum returned more than one line!'
    end
    #sometimes there is a \ in front of the MD5
    if md5[0..0] == '\\'
      return md5[1..32]
    else
      return md5[0..31]
    end
  end
end

class MFile
  attr_reader :md5, :dir, :mtime, :path
  def initialize(path)
    @path = path
    @size = File.size(path)
    @mtime = File.mtime(path)
    @md5 = false
    @dir = PMDirectoryManager.instance.for_file(path)
  end
  
  def size
    @size
  end
  
  def to_s
    UniversalHash.instance.escape(@path)
  end
  
  def fullpath
    return UniversalHash.instance.escape(@dir.realpath.to_s + '/' + fname)
  end
  
  def calculate_md5
    return @md5 if @md5
    #change ' to \'
    @md5 = UniversalHash.instance.hash(@path)
    return @md5
  end

  def fname
    return File.basename(@path)
  end
  
  def inside? path
    @path =~ %r|^#{Regexp.escape(path)}|
  end
end #MFile

#sometimes decision will be made based on the directory
#the file is in
class PMDirectory < Pathname
  attr_reader :level 
  def initialize(path, hash=[])
    super(path)

    if !directory?
      raise "#{path} is not a directory!"
    end
    #initialize @files_count but only
    #if need_files_count
    @files_count = -1
    @level = path.count('/')
  end
  
  #no of files inside this directory 
  def files_count
    if @files_count == -1
      @files_count = 0
      Pathname.glob("#{self}/*") do |name|
        @files_count += 1 if File.file? name  
      end
    end
    
    return @files_count
  end #def files_count
  
end

class PMDirectoryManager
  include Singleton

  def initialize
    @directories = {}
  end
  
  #gets directory (MDirectory) for a given file
  def for_file(file_path)
    #should be always a file
    if !File.file? file_path
      raise "#{file_path} is not a file, quitting"
    end
    
    #TODO: portability, speed
    dir_path = file_path.sub(/(.*)\/(.*)$/,'\1')
    #    pp @directories
    if !@directories[dir_path]
      parent = PMDirectory.new(dir_path)
      @directories[dir_path] = parent
    end
    return @directories[dir_path]
  end #def for_file
end #class PMDirectoryManager

class Manager
  attr_reader :group_by_hash, :remove_by_hash, :lucky_by_hash, :group_by_size, :uniq

  def initialize(cfg)
    #path => MFile
    @files = {}
    #groups
    #MD5 => [file1,file2,...]
    @group_by_hash = {}
    @remove_by_hash = {}
    @lucky_by_hash = {}
    @group_by_size = {}
    @from_cache = false
    @max_mtime = Time.at(0)
    @uniq = []
    
    #create Secondary Choice object for me
    @sc = SecondaryChoice.new(cfg.secondary)
    @dir_manager = PMDirectoryManager.instance
    @cache = Cache.new
    
    #those will always get executed, so we will check
    #latest mtime of all files and dirs
    add_dirs(cfg.directories)
    add_files(cfg.files)
   
    #see if the cache can be used
    if @cache.hit?(@max_mtime)
      @group_by_hash = @cache.get
      @from_cache = true
      $l.warn "Using cached information"
    else
      @cache.clear
      find_same_sizes
      group_by_size.each do |k,v|
        find_same_MD5(v)
      end
    end
    
    find_safe_to_remove
    
    #generate statistics
    Stats.instance.files = @files.length
    
    @cache.save(@group_by_hash) if !@from_cache
    @cache.clear(true) if cfg.clear_cache
  end #def initialize
  
  def add_dirs(dirs)
    cfg = AppConfig.instance
    
    
    
    
    dirs = [dirs] if !dirs.respond_to? 'each'
    dirs.each do |dir|
      #only for tracking time of last mod
      #puts "checking #{dir}"
      @max_mtime = File.mtime(dir) if File.mtime(dir) > @max_mtime
      #get all the files from directory
      #it will not follow symlinks - good
      Dir.glob(dir + '/**/*') do |fname|
        ignore = false
        @max_mtime = File.mtime(fname) if File.mtime(fname) > @max_mtime
        if File.directory?(fname)
          Stats.instance.dirs += 1
          next
        end

        if File.file? fname
          if @files.has_key? fname
            raise "File already parsed: '#{fname}'!"
          end
          if File.size(fname) <= 0
            $l.warn "Ignoring empty file: '#{fname}'"
            next
          end
          if !cfg.slinks && File.symlink?(fname)
            $l.info "Ignoring symbolic link: '#{fname}'"
            next
          end
          #check if file should be excluded
          cfg.exclude.each do |re|
            if fname =~ re
              $l.warn "File excluded: #{fname}"
              ignore = true
              break
            end
          end

          next if ignore

          #finally, add file to our structure
          @files[fname] = MFile.new(fname)
          #@max_mtime = @files[fname].mtime if @files[fname].mtime > @max_mtime 
        end
      end
    end #dirs.each
    
  end #add_dirs
  
  def add_files(files)
    files = [files] if !files.respond_to? 'each'
    files.each do |fname|
      if @files.has_key? fname
        $l.error "File already parsed: '#{fname}', quitting!"
        exit 1
      end
      if File.size(fname) <= 0
        $l.warn "Ignoring empty file: '#{fname}'"
        next
      end
      @files[fname] = MFile.new(fname)
    end
  end
  
  def find_same_sizes
    #first group them by size
    #shallow copy of the list 
    files = @files.dup
    #get the keys
    keys = files.keys
    i=0
    $l.debug "Comparing file sizes"
    while i < keys.length
      size = files[keys[i]].size
      j = i + 1
      uniq = true
      while j < keys.length
        if size == files[keys[j]].size
          uniq = false
          @group_by_size[size] ||= []
          @group_by_size[size] << files[keys[j]] 
          #remove second file from list
          keys.delete_at j
          j -= 1
        end
        j += 1 
      end
      #if it was a duplicate, also add file number i
      if !uniq
        @group_by_size[size] << files[keys[i]]
        #and remove it from the list
        keys.delete_at i
        i -= 1
      end
      i += 1
    end #external while
    #    pp @group_by_size
    #    pp keys
    #whatever is left in keys is uniq
    @uniq += keys
  end #find_same_sizes
  
  #Updates @grouped_by_md5 and @uniq
  def find_same_MD5(files)
    #no of links to the same inode
    #File.stat("testfile").nlink
    groups = {}
    files.each do |f|
      f.calculate_md5
      groups[f.md5] ||= []
      groups[f.md5] << f
    end
    grouped_by_md5 = {}
    uniq = []
    groups.each do |k,v|
      if v.length > 1 #more than 1 the same file
        Stats.instance.duplicates += v.length
        #how much space will we save?
        Stats.instance.space_reclaim += (v.length-1)*v[0].size
        @group_by_hash[k] ||= []
        @group_by_hash[k] += v
      else
        uniq << v
      end
    end
    @uniq += uniq
    return uniq
  end #find_same_MD5
  
  def find_safe_to_remove
    cfg = AppConfig.instance
    #pp m.group_by_hash
    #by default prioritize by the order of paths given on commandline
    @group_by_hash.each do |md5,files|
      lucky = []
      cfg.priority.each do |prior|
        #always iterate all files for given priority directory
        #to find more than one file from the same (sub)directory
        files.each do |file|
          if file.inside? prior #stop on first match
            #        puts "Maybe will not remove: #{file}"
            lucky << file
          end
        end
        break if lucky.length > 0
      end #cfg.priority.each
      #pp lucky
      #make the decision about removing files

      lucky = files.dup if lucky.length == 0 #if none then all ;)
   
      if lucky.length == 1
        #the choice is simple!
        lucky.replace [lucky[0]]
      elsif lucky.length > 1
        #secendary mechanism for choosing files
        #        puts "Before:"
        #        pp lucky
        @sc.choose!(lucky)
        #        puts "After:"
        #        pp lucky
      else
        #ops
        raise "Something went wrong while processing #{files}"
      end
      if lucky.length > 1
        raise "More than 1 'original' file"
      end
      @lucky_by_hash[md5]  = lucky[0]
      #remove lucky guy from the list
      @remove_by_hash[md5] = files - lucky
    end #m.group_by_hash.each 
    
  end #def find_safe_to_remove

  def generate_output
    cfg = AppConfig.instance
    @remove_by_hash.each do |md5, files|
      files.each do |d|
        o = @lucky_by_hash[md5]
        out = eval(cfg.generate)
        cfg.outfile.puts(out)
      end
    end
    
    #display stats at the very end

    if @from_cache
      cfg.outfile.puts COMMENT + "No scan done, cache used"
    else
      cfg.outfile.puts Stats.instance.summary(COMMENT)
    end
  end #def generate_output
  
end

class AppConfig
  include Singleton

  def all_options
    @options
  end
  
  def method_missing(name, *args)
    return @options.send(name) if @options.respond_to? name
    raise "Missing method #{name} in OptParser"
  end
  
  #
  # Return a structure describing the options.
  #
  def parse(args = ARGV)
    # The options specified on the command line will be collected in *options*.
    # We set default values here.
    @options = OpenStruct.new
    @options.verbose = false
    @options.recursive = true
    @options.slinks = false
    @options.hlinks = false
    @options.directories = []
    @options.files = []
    @options.priority = []
    @options.secondary = []
    @options.generate = '"rm #{d}"'
    @options.outfile = $stdout
    @options.exclude = []
    @options.md5path = 'md5sum'
    @options.by_inode = false
    @options.cache = :on
    @options.clear_cache = false
    @options.cache_force = false
    
    options = OptionParser.new do |opts|
      opts.banner = "Usage: #{$0} [options] dir1 dir2 dir3 ...
Or:    #{$0} -C clear"

      opts.separator ""
      opts.separator "Specific options:"

      opts.on("-v", "--verbose", "Run verbosely") do |v|
        @options.verbose = v
      end
=begin NOT IMPLEMENTED YET
      opts.on("-l", "--soft-links", "Don't ignore soft links") do |v|
        @options.slinks = v
      end

      opts.on("-L", "--hard-links", "Don't ignore hard links") do |v|
        @options.hlinks = v
      end
=end
      opts.on("-q", "--quiet", "Run quietly") do |v|
        @options.quiet = v
      end
      
      opts.on("-e", "--exclude PATTERN", Array,
        "Exclude files matched by regular expressions") do |v|
        @options.exclude << /#{v}/i
      end
=begin NOT IMPLEMENTED YET      
      opts.on("-r", "--[no-]recursive", "Follow directories recursively") do |v|
        @options.recursive = v
      end

      opts.on("-a", "--analyze", "Analyze mode") do |v|
        @options.analyze = v
      end
=end

      # List of arguments.
      #      opts.on("--list x,y,z", Array, "Example 'list' of arguments") do |list|
      #        options.list = list
      #      end
        
      # Optional argument with keyword completion.
      opts.on("-s","--secondary-choice x,y,z", Array,
        "Which files should I prefer? Possible values: " +
          "short, long, deep, shallow, dirfull, dirempty, random") do |v|
        @options.secondary = v
      end
    
      opts.on("-c","--command COMMAND","Command to display for every (but one) non-unique file") do |v|
        @options.generate = "\"#{v}\""
      end
      
      opts.on("-f","--outfile FILE","File to save generated statements. Will overwrite existing file!") do |v|
        @options.outfile = File.new(v,'w')
      end
      
      opts.on("--md5-path PATH","Path to md5sum utility") do |v|
        @options.md5path = v
      end

      opts.on("-C","--cache OPTION", [:clear,:on,:off,:force],
        "Available OPTIONs: clear, off, on, force. Default: on.") do |v|
        @options.cache = v
        @options.clear_cache = true if v == :clear
        @options.cache_force = true if v == :force
      end
=begin NOT IMPLEMENTED YET
      opts.on("-i","--use-inodes","Use it if you have problems with weird file names") do |v|
        @options.by_inode = v
      end
=end
    end

    options.parse!(args)
    if args.length < 1
      #maybe use want to clear cache only?
      if @options.cache==:clear
        puts "Clearing the cache and exitting"
        cache = Cache.new
        cache.clear(true)
        exit 0
      end
      puts "Provide at least one directory"
      puts options
      exit 1
    end
  
    $l.level = Log4r::DEBUG if @options.verbose
    $l.level = Log4r::ERROR if @options.quiet
      
    args.each do |path|
      if !File.exist? path
        $l.warn "Can't read '#{path}', I'm ignoring it."
        next
      end
      if File.directory? path
        if dup = dir_child?(path)
          $l.warn "Adding dir and sub-dir in this order doesn't make any sense."
          $l.warn "Ignoring '#{path}' because of '#{dup}'"
        elsif  dup = dir_parent?(path)
          #remove subfolder, add it's parent
          @options.priority << path
          $l.debug "Using '#{path}' for priority only"
          @options.directories.delete dup
          @options.directories << path
        else #totaly different
          @options.directories << path 
          @options.priority << path
        end
      end
      @options.files << path if File.file? path
    end #'?'
  end  # initialize()
  
  def dir_child? path
    @options.directories.each do |dir|
      if path =~ %r|^#{Regexp.escape(dir)}|
        return dir
      end
    end
    return false
  end #dir_child?

  def dir_parent? path
    @options.directories.each do |dir|
      if dir =~ %r|^#{Regexp.escape(path)}|
        return dir
      end
    end
    return false
  end #dir_child?  
end  # class OptparseExample

#gathering some basic statistics during script execution
#generating summary
class Stats
  include Singleton
  attr_accessor :sc_random_used, :dirs, :files
  attr_accessor :duplicates, :space_reclaim

  def initialize
    @dirs = 0
    @files = 0
    @sc_random_used = 0
    @duplicates = 0
    @space_reclaim = 0
  end

  def summary(prefix='')
    str = ""
    str += prefix+"Files scanned:\t\t#{@files}\n" if @files
    str += prefix+"Directories scanned:\t#{@dirs}\n" if @dirs
    str += prefix+"Duplicates found:\t#{@duplicates}\n" if @duplicates
    str += prefix+"Space to reclaim:\t#{@space_reclaim.to_s.gsub(/(\d)(?=(\d\d\d)+(?!\d))/, "\\1,")} B\n" if @space_reclaim
    str += prefix+"Random choice used:\t#{@sc_random_used} times\n" if @sc_random_used
  end
end #class Stats


#the same cache can be used if following opitons match:
#cwd, recursive, hlinks, slinks, directories, files, exclude
class Cache
  def initialize
    @dump_struct = {}
    @check_options =[:recursive,:hlinks,:slinks,:directories,:files]
    @cache_dir = ENV["HOME"]+"/.pmatch"
    @digest = hash_options
    @cfg = AppConfig.instance
    @l = Log4r::Logger.new 'cache'
#    @l.outputters = Log4r::FileOutputter.new('file',:filename=>'cache.log',:truncate=>true)
    @l.level = Log4r::DEBUG
    
    #do the new directory if one doesn't exist
    if File.writable?(@cache_dir) && File.directory?(@cache_dir) 
      #all OK
      @enabled = true
    else
      if File.exists?(@cache_dir) #oops
        @l.error "I can't write to #{@cache_dir}. Cache disabled."
        @enabled = false
      else
        Dir.mkdir(@cache_dir)
        @enabled = true
      end
    end
    @l.debug "Cache init, @enabled=#{@enabled}"
  end
  
  #scans all directories to see if they changed 
  #since cacheing time 
  def checkFS
  end
  
  #saves object and metadata
  def save(cache)
    metadata = {:exclude=>@cfg.exclude}
    fullcache = {:cache=>cache,:metadata=>metadata}
    file = File.new(@cache_dir + "/" + @digest,'w')
    file.write(Marshal.dump(fullcache))
    file.close
  end

  #returns already grouped list 
  def get
    #pp @fullcache
    filter = @cfg.exclude - @fullcache[:metadata][:exclude]
    filter.each do |re|
      @fullcache[:cache].each do |k,group|
        group.reject! { |mfile| mfile.path =~ re }
      end
    end

    @fullcache[:cache].reject! {|k,v| v.length < 2}
    
    return @fullcache[:cache]    
  end
  
  def hit?(latest_fs_change)
    @l.debug "Is cache hit?"
    if !@digest || !@enabled
      return false 
    end
    @l.debug "@digest and @enabled true"
    
    #check in user's home directory
    if !File.exist? @cache_dir + "/" + @digest
      @l.debug "No cache file: #{@cache_dir}/#{@digest}"
      return false 
    end
    @l.debug "File #{@cache_dir}/#{@digest} exists"
    
    file = File.new(@cache_dir + "/" + @digest)
    @fullcache = Marshal.restore(file)
    file.close

    #check excludes - maybe something was excluded when cache was saved
    #but it's not excluded now
    cache_excludes =  @fullcache[:metadata][:exclude]
    #pp return false 
    if (cache_excludes - @cfg.exclude).length > 0
      return false 
    end
    @l.debug "No excludes in cached version that are not used now"
    
    cache_time = File.mtime(@cache_dir + "/" + @digest)
    @l.debug "latest_fs_change=#{latest_fs_change}, cache_time=#{cache_time}"
    if latest_fs_change >= cache_time 
      return false if !@cfg.cache_force
      #cache was forced and FS have changed!
      $l.warn "I've detected some changes in scanned directories but you forced using cache"
    end
    
    #SUCCESS
    @l.debug "Success: cache hit!"
    @l.debug @fullcache
    return true
  end
  
  #clears cache if directory contains more than CACHE_CLEAR_IF_FILES files
  #or clear option is given
  def clear(force=false)
    files = Dir.glob(@cache_dir+"/"+"[a-f0-9]"*32)
    
    if force || files.length > CACHE_CLEAR_IF_FILES
      files.each {|f| File.delete(f)}
    end
  end #def clear
  
  #create short hash from all given options
  def hash_options()
    digest = Digest::MD5.new
    digest << Dir.pwd
    #because of the excludes number of files will differ
    #    digest << files_count.to_s
    
    @check_options.each do |option|
      digest << AppConfig.instance.send(option).to_s
    end
    
    return digest.hexdigest
  end
end

if __FILE__ =~ /#{$0}$/
  $l = Log4r::Logger.new 'main'
  $l.outputters = Log4r::Outputter.stderr
  $l.level = Log4r::WARN
  
  cfg = AppConfig.instance
  cfg.parse
  
  m = Manager.new(cfg)
  m.generate_output
end