#!/usr/bin/env oo-ruby

#--
# Copyright 2012 Red Hat, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#    http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#++

require 'rubygems'
require 'getoptlong'
require 'time'

# Writes the command-line help text to standard output and then terminates
# the script with exit status 255. This method never returns to its caller.
def usage
  help_text = <<USAGE
== Synopsis

#{$0}: Check all user applications

== Usage

#{$0} OPTIONS

Options:
-v|--verbose
    Print information about each check being performed
-l|--level [0, 1, 2]
    Level '0' is default, and checks to see if any gears are present in mongo but don't exist on the nodes and vice-versa.
    Level '1' performs extra checks such as integrity of consumed_gears count
    Level '2' additionally performs checks for application data integrity in mongo and checks for unused and unreserved gear UIDs 
-h|--help
    Show Usage info
USAGE
  $stdout.puts(help_text)
  exit(255)
end

# Parse the command-line options into a hash keyed by the long option name.
# Any option GetoptLong rejects shows the usage text, which exits the script.
args = {}
begin
  opts = GetoptLong.new(
    ["--verbose",          "-v", GetoptLong::NO_ARGUMENT],
    ["--level",            "-l", GetoptLong::REQUIRED_ARGUMENT],
    ["--help",             "-h", GetoptLong::NO_ARGUMENT]
  )
  opts.each { |name, value| args[name] = value }
rescue GetoptLong::Error
  usage
end

usage if args["--help"]

# String#to_i turns any non-numeric text into 0, which would silently run the
# weakest check level; reject such values instead of guessing.
level_arg = args["--level"]
usage if level_arg && level_arg !~ /\A\d+\z/

# nil.to_i is 0, so omitting --level selects the default level 0.
level = level_arg.to_i
verbose = args["--verbose"]

# Load the broker application environment. This happens only after option
# parsing, so --help and rejected options exit before the environment is loaded.
require "/var/www/openshift/broker/config/environment"
# Disable analytics for admin scripts
Rails.configuration.analytics[:enabled] = false
# Override the message broker RPC options for this run: `disctimeout`
# (presumably the discovery timeout) and `timeout` (presumably the request
# timeout). Units are not visible here, likely seconds -- TODO confirm.
Rails.configuration.msg_broker[:rpc_options][:disctimeout] = 20
Rails.configuration.msg_broker[:rpc_options][:timeout] = 600

# Re-checks whether a gear with the given uuid is present in an application
# document. Every call increments $false_positive_check_cnt; once that counter
# reaches FALSE_POSITIVE_CHECK_LIMIT no query is issued and
# false_positive_response is returned instead.
#
# gear_uuid - uuid of the gear to look for
# app_uuid - optional uuid of the owning application, added to the query first
# false_positive_response - value returned once the re-check limit is reached
#
# Returns the result of the existence query, or false_positive_response.
def datastore_has_gear?(gear_uuid, app_uuid=nil, false_positive_response=true)
  $false_positive_check_cnt += 1
  return false_positive_response unless $false_positive_check_cnt < FALSE_POSITIVE_CHECK_LIMIT

  # specifying the app_uuid first, if present
  # this makes better use of the indexes and performance is better
  criteria = app_uuid ? { "uuid" => app_uuid } : {}
  criteria['group_instances.gears.uuid'] = gear_uuid
  Application.where(criteria).exists?
end

# Re-checks whether any application has a gear that uses gear_uid on one of
# the server identities in si_list. Every call increments
# $false_positive_check_cnt; once the counter reaches
# FALSE_POSITIVE_CHECK_LIMIT no query is issued and false_positive_response is
# returned instead.
def datastore_has_gear_uid?(gear_uid, si_list, false_positive_response=true)
  $false_positive_check_cnt += 1
  return false_positive_response if $false_positive_check_cnt >= FALSE_POSITIVE_CHECK_LIMIT

  gear_match = {"uid" => gear_uid, "server_identity" => {"$in" => si_list}}
  Application.where("group_instances.gears" => {"$elemMatch" => gear_match}).exists?
end

# Re-checks whether gear_uid is listed in the available_uids of the district
# with the given id (the id is converted to a string for the query). Every
# call increments $false_positive_check_cnt; once the counter reaches
# FALSE_POSITIVE_CHECK_LIMIT no query is issued and false_positive_response is
# returned instead.
def district_has_available_uid?(district_id, gear_uid, false_positive_response=true)
  $false_positive_check_cnt += 1
  if $false_positive_check_cnt >= FALSE_POSITIVE_CHECK_LIMIT
    false_positive_response
  else
    District.where("_id" => district_id.to_s, "available_uids" => gear_uid).exists?
  end
end

# Builds the summary record kept for a user document.
#
# user - hash-like user document; "ssh_keys" may be absent
#
# Returns a hash with the user's "consumed_gears" and "login", an empty
# "domains" hash, and "ssh_keys" mapping the MD5 hex digest of each key's
# content to that key's name.
def get_user_info(user)
  key_names_by_digest = (user["ssh_keys"] || []).inject({}) do |digests, key|
    digests[Digest::MD5.hexdigest(key["content"])] = key["name"]
    digests
  end

  {
    "consumed_gears" => user["consumed_gears"],
    "domains" => {},
    "login" => user["login"],
    "ssh_keys" => key_names_by_digest
  }
end

# Re-reads a user with strong consistency and counts the gears across all
# group instances of all applications in all of the user's domains.
#
# user_id - _id of the user to look up
#
# Returns a two-element array: the user's stored consumed_gears value and the
# counted number of gears. When the user cannot be found, an error line is
# printed and [0, 0] is returned.
def check_consumed_gears(user_id)
  user = CloudUser.with(consistency: :strong).find_by(:_id => user_id)
  gear_total = 0
  user.domains.each do |domain|
    domain.applications.each do |application|
      application.group_instances.each { |group| gear_total += group.gears.length }
    end
  end
  [user.consumed_gears, gear_total]
rescue Mongoid::Errors::DocumentNotFound
  puts "Error: User with ID #{user_id} not found in mongo"
  [0, 0]
end

# Cap and running counter for the datastore re-check helpers, which stop
# querying once the counter reaches the cap.
FALSE_POSITIVE_CHECK_LIMIT = 4000
$false_positive_check_cnt = 0

# Overall result flag and the list of problem descriptions printed at the end.
no_error = true
summary = []

# Lookup tables filled in by the checks below.
datastore_hash   = {}
user_hash        = {}
domain_hash      = {}
gear_uid_hash    = {}
district_hash    = {}
gear_sshkey_hash = {}

# store the current time for comparisons
current_time = Time.now
puts "Started at: #{current_time}"
# start of the run in integer milliseconds, used for the elapsed-time output
start_time = (current_time.to_f * 1000).to_i

# Field selections passed to the datastore queries.
app_selection = {
  fields: %w[
    name
    uuid
    created_at
    domain_id
    group_instances.gears.uuid
    group_instances.gears.uid
    group_instances.gears.server_identity
    group_instances._id
    component_instances._id
    component_instances.group_instance_id
    app_ssh_keys.name
    app_ssh_keys.content
  ],
  timeout: false
}

user_selection = {
  fields: %w[consumed_gears login ssh_keys.name ssh_keys.content],
  timeout: false
}
# same selection with the :read option set to :primary
user_selection_primary = user_selection.merge(read: :primary)

domain_selection = {
  fields: %w[owner_id system_ssh_keys.name system_ssh_keys.content],
  timeout: false
}

# Level 1 and above: load users and domains from mongo into user_hash and
# domain_hash so later checks can resolve application owners.
if level >= 1
  # Cache the summary record of every user, keyed by the stringified _id.
  OpenShift::DataStore.find(:cloud_users, {}, user_selection) do |user|
    user_hash[user["_id"].to_s] = get_user_info(user)
  end

  OpenShift::DataStore.find(:domains, {}, domain_selection) do |domain|
    owner_id = domain["owner_id"].to_s
    domain_hash[domain["_id"].to_s] = owner_id

    # MD5 hex digest of key content => key name, for the domain's system keys
    system_ssh_keys = {}
    domain["system_ssh_keys"].each { |k| system_ssh_keys[Digest::MD5.hexdigest(k["content"])] = k["name"] } if domain["system_ssh_keys"]

    # The owner was not seen in the bulk read above; look it up again using the
    # selection that sets :read => :primary before reporting it as missing.
    unless user_hash[owner_id]
      OpenShift::DataStore.find(:cloud_users, {"_id" => BSON::ObjectId(owner_id)}, user_selection_primary) do |user|
        user_hash[user["_id"].to_s] = get_user_info(user)
      end
    end

    if user_hash[owner_id]
      # start the per-domain gear tally at zero and fold the domain's system
      # keys into the owner's key map
      user_hash[owner_id]["domains"][domain["_id"].to_s] = 0
      user_hash[owner_id]["ssh_keys"].merge! system_ssh_keys
    else
      summary << "User '#{owner_id}' for domain '#{domain['_id']}' does not exist in mongo."
      no_error = false
    end
  end
end

# Level 2 with districts enabled: cache each district as
# [name, max_capacity, server identity names, available_uids] keyed by the
# stringified _id, and report districts whose available_uids count differs
# from their available_capacity.
if level >= 2 && Rails.configuration.msg_broker[:districts][:enabled]
  OpenShift::DataStore.find(:districts, {}, {:timeout => false}) do |district|
    # names of the district's server identities, skipping entries without one
    si_list = district["server_identities"].map { |si| si["name"] }.compact
    available_uids = district["available_uids"]
    district_hash[district["_id"].to_s] = [district["name"], district["max_capacity"], si_list, available_uids]

    # check available_uids list length and available_capacity
    available_capacity = district["available_capacity"]
    if available_uids.length != available_capacity
      summary << "District '#{district["name"]}' has (#{available_uids.length}) available UIDs but (#{available_capacity}) available capacity"
      no_error = false
    end
  end
end

# Scan the applications returned for app_query (presumably those with at least
# one gear -- TODO confirm the dotted-index query semantics) and record every
# gear in datastore_hash. At level >= 1 the owning domain and user are
# resolved as well; at level >= 2 component and group instances are
# cross-checked and gear UIDs are collected per node.
app_query = {"group_instances.gears.0" => {"$exists" => true}}
OpenShift::DataStore.find(:applications, app_query, app_selection) do |app|
    gear_count = 0
    owner_id = nil
    login = nil
    creation_time = app['created_at']
    domain_id = app['domain_id'].to_s
    # MD5 hex digest of key content => key name, for the application's own keys
    app_ssh_keys = {}
    app['app_ssh_keys'].each { |k| app_ssh_keys[Digest::MD5.hexdigest(k["content"])] = k["name"] } if app['app_ssh_keys']
    
    if level >= 1
      owner_id = domain_hash[domain_id]

      # A missing domain or user is only reported when the application was
      # created more than 600 seconds before the start of this run.
      if owner_id.nil?
        if (current_time - creation_time) > 600
          summary << "Application '#{app['name']}' does not have a domain '#{domain_id}' in mongo."
          no_error = false
        end
      elsif user_hash[owner_id].nil?
        if (current_time - creation_time) > 600
          summary << "Application '#{app['name']}' for domain '#{domain_id}' does not have a user '#{owner_id}' in mongo."
          no_error = false
        end
      else
        login = user_hash[owner_id]["login"]
        # keys recorded for the owner are expected on the gears in addition to
        # the application's own keys
        app_ssh_keys.merge! user_hash[owner_id]["ssh_keys"]
      end
            
      if (level >= 2) and ((current_time - creation_time) > 600)
        # check for component instances without corresponding group instances and vice-versa
        # gi_hash maps each group instance id to whether some component
        # instance refers to it
        gi_hash = {}
        app["group_instances"].each do |gi|
          gi_hash[gi["_id"].to_s] = false
        end

        if app['component_instances']
          app['component_instances'].each do |ci|
            if ci['group_instance_id'] and gi_hash.has_key? ci['group_instance_id'].to_s
              gi_hash[ci['group_instance_id'].to_s] = true
            else
              summary << "Application '#{app['name']}' with Id '#{app['_id']}' has missing group instance for component instance '#{ci['_id']}'."
              no_error = false
            end
          end
          # any group instance still marked false has no component instance
          gi_hash.each do |gi_id, present|
            unless present
              summary << "Application '#{app['name']}' with Id '#{app['_id']}' has no components for group instance with Id '#{gi_id}'"
              no_error = false
            end
          end
        else
          summary << "Application '#{app['name']}' with Id '#{app['_id']}' doesn't have any component instances"
          no_error = false
        end
      end
    end
    app['group_instances'].each do |gi|
      gi['gears'].each do |gear|
        gear_count += 1
        # entry layout: [login, created_at, uid, server_identity, app uuid, ssh key map]
        datastore_hash[gear['uuid'].to_s] = [login, creation_time, gear['uid'], gear['server_identity'], app["uuid"], app_ssh_keys ]
        
        if level >= 2 and Rails.configuration.msg_broker[:districts][:enabled]
          # record all used uid values for each node to match later with the district
          gear_uid_hash[gear['server_identity']] = [] unless gear_uid_hash.has_key?(gear['server_identity'])
          gear_uid_hash[gear['server_identity']] << gear['uid'].to_i
        end
      end
    end
    # add this application's gears to the owner's tally for the domain
    user_hash[owner_id]["domains"][domain_id] += gear_count if level >= 1 and user_hash[owner_id]
end

# report the elapsed time since start_time (kept in milliseconds) in seconds
total_time = (Time.now.to_f * 1000).to_i - start_time
puts "Time to fetch mongo data: #{total_time.to_f/1000}s"
puts "Total gears found in mongo: #{datastore_hash.length}"

# Level 1 and above: compare each user's stored consumed_gears with the gear
# count tallied per domain during the application scan. On a difference the
# user is re-read through check_consumed_gears, and a problem is reported only
# if that re-read still shows a mismatch.
if level >= 1
  user_hash.each do |owner_id, owner_hash|
    tallied_gears = owner_hash["domains"].values.inject(0) { |sum, count| sum + count }

    print "Checking consumed gear count for user #{owner_hash['login']}...\t" if verbose

    mismatch = false
    if owner_hash['consumed_gears'] != tallied_gears
      user_consumed_gears, app_actual_gears = check_consumed_gears(owner_id)
      mismatch = (user_consumed_gears != app_actual_gears)
    end

    if mismatch
      puts "FAIL" if verbose
      summary << "User #{owner_hash['login']} has a mismatch in consumed gears (#{user_consumed_gears}) and actual gears (#{app_actual_gears})"
      no_error = false
    elsif verbose
      puts "OK"
    end
  end
end

# Fetch the gear list from the nodes and report how long the call took.
# Timestamps are kept as integer milliseconds and printed as seconds.
gears_fetch_started_ms = (Time.now.to_f * 1000).to_i
node_hash, gear_sender_hash = OpenShift::ApplicationContainerProxy.get_all_gears
total_time = (Time.now.to_f * 1000).to_i - gears_fetch_started_ms
puts "Time to get all gears from nodes: #{total_time / 1000.0}s"
puts "Total gears found on the nodes: #{node_hash.length}"
puts "Total nodes that responded : #{gear_sender_hash.length}"

# Level 2 and above: also fetch the ssh keys of all gears from the nodes.
if level >= 2
  sshkeys_fetch_started_ms = (Time.now.to_f * 1000).to_i
  gear_sshkey_hash, sshkeys_sender_list = OpenShift::ApplicationContainerProxy.get_all_gears_sshkeys
  total_time = (Time.now.to_f * 1000).to_i - sshkeys_fetch_started_ms
  puts "Time to get all sshkeys for all gears from nodes: #{total_time / 1000.0}s"
  puts "Total gears found on the nodes: #{gear_sshkey_hash.length}"
  puts "Total nodes that responded : #{sshkeys_sender_list.length}"
end


# now check
# For every gear recorded from mongo, verify that a node reported it and that
# the node matches the stored server_identity. At level >= 2 the ssh keys
# reported for the gear are compared with the keys expected from mongo.

# node => number of mongo gears on it, for nodes that hold gears in mongo but
# are absent from the gear-list responses
non_responding_nodes = Hash.new(0)
puts "Checking application gears and ssh keys on corresponding nodes:" if verbose
datastore_hash.each do |gear_uuid, gear_info|
  # gear_info layout: [login, created_at, uid, server_identity, app uuid, ssh key map]
  creation_time = gear_info[1]
  server_identity = gear_info[3]
  app_uuid = gear_info[4]
  db_sshkeys = gear_info[5]

  # Gears whose application was created within the last 600 seconds are
  # reported OK without being checked.
  settled = (current_time - creation_time) > 600

  print "#{gear_uuid}...\t" if verbose
  if !settled
    puts "OK" if verbose
  elsif !node_hash.has_key?(gear_uuid)
    if !gear_sender_hash.has_key?(server_identity)
      # the gear's node never answered; report it once per node after the loop
      non_responding_nodes[server_identity] += 1
      no_error = false
      puts "FAIL" if verbose
    elsif datastore_has_gear?(gear_uuid, app_uuid, false)
      # the gear is still in mongo on re-check, so it is genuinely missing
      no_error = false
      puts "FAIL" if verbose
      summary << "Gear #{gear_uuid} does not exist on any node"
    elsif verbose
      puts "OK"
    end
  elsif server_identity != node_hash[gear_uuid][0]
    puts "FAIL" if verbose
    no_error = false
    summary << "Gear #{gear_uuid} exists but node '#{node_hash[gear_uuid][0]}' does not match DB server_identity '#{server_identity}'"
  elsif verbose
    puts "OK"
  end

  if level >= 2
    print "Checking ssh keys for gear: #{gear_uuid}...\t" if verbose
    # the case where gear is not returned from mcollective is handled by the
    # earlier checks, so only gears with reported keys are compared here
    if settled && gear_sshkey_hash.has_key?(gear_uuid)
      gear_sshkeys_list = gear_sshkey_hash[gear_uuid].keys.uniq.sort
      db_sshkeys_list = db_sshkeys.keys.uniq.sort
      if db_sshkeys_list == gear_sshkeys_list
        puts "OK" if verbose
      else
        no_error = false
        puts "FAIL" if verbose

        # keys present on the node but not expected from mongo
        (gear_sshkeys_list - db_sshkeys_list).each do |key|
          summary << "Gear '#{gear_uuid}' has  key with comment '#{gear_sshkey_hash[gear_uuid][key]}' on the node but not in mongo."
        end

        # keys expected from mongo but not present on the node
        (db_sshkeys_list - gear_sshkeys_list).each do |key|
          summary << "Gear '#{gear_uuid}' has key with name '#{db_sshkeys[key]}' in mongo but not on the node."
        end
      end
    elsif verbose
      puts "OK"
    end
  end
end

# print error messages for any non-responding nodes
non_responding_nodes.each do |server_identity, gear_count|
  summary << "The node #{server_identity} expected to contain #{gear_count} gears wasn't returned from mcollective for the gear list"
end


# now check reverse
# For every gear reported by a node, verify that it exists in mongo and that
# the uid in use on the node equals the uid stored for the gear in mongo.
puts "Checking node gears in application database:" if verbose
node_hash.each do |gear_uuid, gear_info|
  print "#{gear_uuid}...\t" if verbose
  datastore_gear_info = datastore_hash[gear_uuid]
  gear_ok = true

  if !datastore_gear_info
    # not seen during the application scan; query mongo again before reporting
    if !datastore_has_gear?(gear_uuid, nil, true)
      gear_ok = false
      summary << "Gear #{gear_uuid} exists on node #{gear_info[0]} (uid: #{gear_info[1]}) but does not exist in mongo database"
    end
  elsif !datastore_gear_info[2].nil?
    begin
      uid = gear_info[1]
      reserved_uid = datastore_gear_info[2].to_i
      if uid != reserved_uid
        gear_ok = false
        summary << "Gear #{gear_uuid} is using uid: '#{uid}' but has reserved uid: '#{reserved_uid}'"
      end
    rescue StandardError => e
      # StandardError rather than Exception, so interrupts and exit requests
      # are not swallowed
      gear_ok = false
      summary << "Failed to check gear: '#{gear_uuid}'s uid because of exception: #{e.message}"
    end
  end

  # The verdict is printed only after every check for the gear has run, so a
  # uid mismatch shows as FAIL in verbose output.
  if gear_ok
    puts "OK" if verbose
  else
    no_error = false
    puts "FAIL" if verbose
  end
end


# Level 2 and above: data-integrity checks run directly against mongo, plus
# district UID bookkeeping checks when districts are enabled.
# district_hash values are [name, max_capacity, server identities, available_uids];
# gear_uid_hash maps a server identity to the integer UIDs of its gears.
if level >= 2
  # check for applications without any group instances in the database
  puts "Checking for application without any group instances in the database:" if verbose
  query = {"group_instances.0" => {"$exists" => false}}
  selection = {:fields => ["name", "created_at"], :timeout => false}
  OpenShift::DataStore.find(:applications, query, selection) do |app|
    print "Application #{app['name']}/#{app['_id']}...\t" if verbose
    # applications created within the last 600 seconds are not reported
    if (current_time - app['created_at']) > 600
      puts "FAIL" if verbose
      no_error = false
      summary << "Application '#{app['name']}' with Id '#{app['_id']}' does not have any group instances"
    elsif verbose
      puts "OK"
    end
  end

  # check for applications without any gears in the group instance
  puts "Checking for application without any gears in the group instances in the database:" if verbose
  query = {"group_instances" => {"$elemMatch" => { "gears" => {"$size" => 0}}}}
  selection = {:fields => ["name", "group_instances._id", "group_instances.gears._id", "created_at"], :timeout => false}
  OpenShift::DataStore.find(:applications, query, selection) do |app|
    print "Application #{app['name']}/#{app['_id']}...\t" if verbose
    if (current_time - app['created_at']) > 600
      app['group_instances'].each do |gi|
        if !gi.has_key?("gears") || gi['gears'].length == 0
          no_error = false
          puts "FAIL" if verbose
          summary << "Application '#{app['name']}' with Id '#{app['_id']}' does not have any gears within group instance '#{gi['_id']}'"
        end
      end
    elsif verbose
      puts "OK"
    end
  end

  # check for users with nil or empty or missing login in the database
  puts "Checking for users with nil or empty or missing login in the database:" if verbose
  query = {"$or" => [{"login" => {"$type" => 10}}, {"login" => ""}, {"login" => {"$exists" => false}}]}
  selection = {:fields => ["_id"], :timeout => false}
  OpenShift::DataStore.find(:cloud_users, query, selection) do |user|
    no_error = false
    summary << "User with Id #{user['_id']} has a null, empty, or missing login."
  end

  if Rails.configuration.msg_broker[:districts][:enabled]
    # check for any unreserved uid in the district
    # these are uids that gears are using but are still present in the district's available_uids
    puts "Checking for unreserved UIDs in the district:" if verbose
    gear_uid_hash.each do |server_identity, uid_list|
      # only the first district that lists this server identity is considered
      district_id, district_info = district_hash.find { |_, info| info[2].include?(server_identity) }
      next unless district_info

      (uid_list & district_info[3]).each do |unreserved_uid|
        # re-checking unreserved UID in the database
        print "Re-checking UID #{unreserved_uid} in district #{district_info[0]} in the database...\t" if verbose
        if !datastore_has_gear_uid?(unreserved_uid, [server_identity], false)
          # the UID is no longer being used by any gear
          puts "OK" if verbose
        elsif !district_has_available_uid?(district_id, unreserved_uid, false)
          # the UID is no longer listed as available in the district
          puts "OK" if verbose
        else
          no_error = false
          puts "FAIL" if verbose
          summary << "UID '#{unreserved_uid}' is available in district '#{district_info[0]}' but used by a gear on node '#{server_identity}'"
        end
      end
    end

    # check for any unused uid in the district
    # these are uids that are reserved in the district, but no gear is using
    puts "Checking for unused UIDs in the district:" if verbose
    first_uid = Rails.configuration.msg_broker[:districts][:first_uid]
    district_hash.each do |district_id, district_info|
      # collect gear uids from all nodes with server identities within this district
      # (reset for every district so that UIDs used in one district cannot
      # hide unused UIDs in another)
      district_used_uids = []
      district_info[2].each do |server_identity|
        district_used_uids |= (gear_uid_hash[server_identity] || [])
      end

      # every UID the district can hand out: max_capacity values from first_uid
      district_all_uids = Array.new(district_info[1].to_i) { |i| first_uid + i }
      district_unused_uids = district_all_uids - district_info[3] - district_used_uids

      district_unused_uids.each do |unused_uid|
        # re-checking unused UID in the database
        print "Re-checking UID #{unused_uid} in district #{district_info[0]} in the database...\t" if verbose
        if datastore_has_gear_uid?(unused_uid, district_info[2], true)
          # found a gear that uses this UID
          puts "OK" if verbose
        elsif district_has_available_uid?(district_id, unused_uid, true)
          # the UID is no longer reserved in the district
          puts "OK" if verbose
        else
          no_error = false
          puts "FAIL" if verbose
          summary << "UID '#{unused_uid}' is reserved in district '#{district_info[0]}' but not used by any gear"
        end
      end
    end

    # check to see if there are multiple gears with the same uid in the same district
    puts "Checking for gears with the same UID:" if verbose
    district_hash.each do |district_id, district_info|
      server_ids = district_info[2]

      # count how often each uid occurs across all nodes of this district
      uid_counts = Hash.new(0)
      server_ids.each do |server_identity|
        (gear_uid_hash[server_identity] || []).each { |uid| uid_counts[uid] += 1 }
      end

      # get all the uids that appear more than once
      reused_uids = uid_counts.keys.select { |uid| uid_counts[uid] > 1 }
      next if reused_uids.empty?

      no_error = false
      puts "FAIL" if verbose
      summary << "Below UIDs are used by multiple gears in district: '#{district_info[0]}'.  Please move one of them to another district."
      reused_uids.each do |uid|
        summary << "UID: #{uid} is used by gears:"
        # datastore_hash values hold the gear uid at index 2 and the server
        # identity at index 3; match on those fields only
        datastore_hash.each do |uuid, gear_info|
          next unless gear_info[2].to_i == uid && server_ids.include?(gear_info[3])
          summary << "\tGear:#{uuid} in #{gear_info[3]}"
        end
      end
    end
  end
end


# Print the overall verdict, followed by every collected problem on failure.
if no_error
  puts "Success"
else
  puts "Check failed.\n#{summary.join("\n")}"
end

# Tell the operator when the re-check helpers reached their query limit.
puts "WARNING: Only checked the first #{FALSE_POSITIVE_CHECK_LIMIT} errors for false positives." if $false_positive_check_cnt >= FALSE_POSITIVE_CHECK_LIMIT

puts "Please refer to the oo-admin-repair tool to resolve some of these inconsistencies." if !no_error

# start_time is in milliseconds; report the total run time in seconds
total_time = (Time.now.to_f * 1000).to_i - start_time
puts "Total time: #{total_time / 1000.0}s"
puts "Finished at: #{Time.now}"

# exit status 0 on success, 1 when any check failed
exit(no_error ? 0 : 1)
