Commit 65bb83fd authored by Anusha Ranganathan

Verify CRC1280 import task

parent 6841512e
Merge request !352: Verify CRC1280 import task
@@ -321,6 +321,7 @@ class PrepareCsvFromCrcFolder
    # fix incorrect spelling in import data
    file_attributes['Modality'] = correct_spelling_of_modality(file_attributes['Modality'])
    experiment_title = [file_attributes['Experiment title'], file_attributes['DublinCore-Title'], file_attributes['DataCite-Title']].find(&:present?)
    experiment_description = [file_attributes['Experiment Description'], file_attributes['DublinCore-Description'], file_attributes['DataCite-Description']].find(&:present?)
    [
@@ -332,7 +333,7 @@ class PrepareCsvFromCrcFolder
      [meta_info[:parent_work_source_identifier], meta_info[:parent_group_source_identifier]].compact.join(';'),
      '',
      '',
      file_attributes['Experiment title'],
      experiment_title,
      file_attributes['Contributor'],
      file_attributes['DataCite-contributorNameIdentifier'],
      file_attributes['DataCite-contributorType'],

require 'json'
require 'csv'
require 'yaml'

class VerifyCRCDatasetImport
  def self.verify_import(import_id)
    importer = ::Bulkrax::Importer.find(import_id)
    work_entries = importer.entries.select { |e| e.class.name == "Bulkrax::CrcFolderEntry" }
    import_status = true
    report_paths = []
    work_entries.each do |work_entry|
      verification_status, file_path = self.verify_experiment_and_report(import_id, work_entry.id)
      import_status = import_status && verification_status
      report_paths.append(file_path)
    end
    return import_status, report_paths
  end

  def self.verify_experiment_and_report(import_id, entry_id)
    verification_status, message, paths_compared, analysis = self.verify_experiment(entry_id, import_id)
    # prepare csv report
    report_name = "analysis_report_#{import_id}_#{entry_id}_#{Time.now.strftime("%m-%d_%H-%M-%S")}.csv"
    report_path = File.join(ENV.fetch('DOWNLOAD_PATH', '/shared/downloads'), report_name)
    headers = %w(id status message)
    CSV.open(report_path, "w") do |csv|
      # Write headers
      csv << headers
      # Write overall verification status
      csv << [JSON.pretty_generate(paths_compared), verification_status, message]
      # Write per-file status
      if analysis
        # Files only in the input directory
        analysis.fetch(:only_in_input, []).each do |i|
          csv << [i, false, "Only in input"]
        end
        # Files only in the work
        analysis.fetch(:only_in_work, []).each do |i|
          csv << [i, false, "Only in work"]
        end
        # Files in both - size analysis
        analysis[:in_both_size_analysis].each do |fn_hash|
          csv << [fn_hash[:key], fn_hash[:result], fn_hash[:message]]
        end
      end
    end
    return verification_status, report_path
  end

  def self.verify_experiment(entry_id, import_id)
    # gather paths compared
    paths_compared = {
      entry_id: entry_id,
      import_id: import_id
    }
    # Find importer entry
    begin
      entry = ::Bulkrax::CrcFolderEntry.find(entry_id)
    rescue
      msg = "Error: Import entry #{entry_id} not found"
      return false, msg, paths_compared, nil
    end
    paths_compared[:source_identifier] = entry.identifier
    # Check import id
    if import_id.to_i != entry.importerexporter_id
      msg = "Error: Importer id #{import_id} does not match id associated with entry #{entry_id}"
      return false, msg, paths_compared, nil
    end
    # Get importer
    begin
      importer = ::Bulkrax::Importer.find(import_id)
    rescue
      msg = "Error: Importer #{import_id} associated with entry #{entry_id} not found"
      return false, msg, paths_compared, nil
    end
    # Get base import dir from importer
    import_base_dir = importer.parser_fields['import_file_path']
    paths_compared[:import_base_dir] = import_base_dir
    # Get import path
    import_folder_name = entry.raw_metadata['folder_name']
    # NOTE: This has the risk of getting the path wrong if the glob finds multiple directories
    wildcard_import_path = File.join(import_base_dir, '*', import_folder_name)
    paths_found = Dir.glob(wildcard_import_path)
    if paths_found.empty?
      msg = "The import directory #{wildcard_import_path} was not found"
      return false, msg, paths_compared, nil
    elsif paths_found.size > 1
      msg = "The import directory #{wildcard_import_path} matched multiple directories"
      return false, msg, paths_compared, nil
    end
    import_path = paths_found[0]
    import_path = import_path + '/' unless import_path.end_with?('/')
    paths_compared[:import_path] = import_path
    # Check import path exists
    unless Dir.exist?(import_path)
      msg = "The import directory #{import_path} was not found"
      return false, msg, paths_compared, nil
    end
    # Check import path is not empty
    if Dir.empty?(import_path)
      msg = "The import directory #{import_path} is empty"
      return false, msg, paths_compared, nil
    end
    # Get CrcDataset for the entry
    works = CrcDataset.where(source: entry.identifier)
    unless works.first.present?
      msg = "The CrcDataset with source identifier #{entry.identifier} was not found"
      return false, msg, paths_compared, nil
    end
    paths_compared[:work_id] = works.first.id
    # Get files to compare
    # -- Get list of files from import directory
    input_list = Dir.glob(File.join(import_path, '**', '*')).
      reject { |fn| File.directory?(fn) }.
      reject { |fn| self.restricted_file_names?(fn.split('/')[-1]) }.
      map { |fn| fn.sub(import_path, '') }.sort
    # ---- downcase folder name
    sanitised_input_list = {}
    input_list.each do |fp|
      sanitised_input_list[fp] = fp
      all_parts = fp.split('/')
      if all_parts.size > 1
        parts = all_parts[0...-1].map(&:downcase)
        new_path = File.join(parts.join("/"), all_parts[-1])
        sanitised_input_list[fp] = new_path
      end
    end
    # ---- insert session folder
    max_depth = 0
    input_list.each do |fp|
      parts = fp.split('/')
      max_depth = parts.size if parts.size > max_depth
    end
    if max_depth == 3
      sanitised_input_list.each do |fp, new_path|
        parts = new_path.split('/')
        if parts.size == 3
          sanitised_input_list[fp] = File.join(parts[0], 'ses 01', parts[1], parts[2])
        end
      end
    end
    # -- Get list of objects from S3 for the CrcDataset
    s3 = S3StorageService.new
    s3.init_client
    bucket_id = s3.sanitise_name(works.first.id)
    list_of_work_objects, _work_total_size = s3.list_all_objects(bucket_id)
    list_of_works = list_of_work_objects.map { |a| a[:key] }.
      reject { |fn| self.restricted_file_names?(fn.split('/')[-1]) }.sort
    # Analyse files
    in_both = sanitised_input_list.values & list_of_works
    only_in_input = sanitised_input_list.values - list_of_works
    only_in_work = list_of_works - sanitised_input_list.values
    # Analyse file size
    size_analysis = []
    sizes_match = true
    in_both.each do |fn|
      original_fn = sanitised_input_list.key(fn)
      fp = File.join(import_path, original_fn)
      input_size = File.size(fp)
      selected_work = list_of_work_objects.find { |a| a[:key] == fn }
      s3_size = selected_work.fetch(:size, 0)
      if input_size == s3_size
        message = "Size is the same - #{input_size}"
      elsif input_size > s3_size
        sizes_match = false
        message = "Input file size #{input_size} is greater than object in S3 #{s3_size}"
      else
        sizes_match = false
        message = "Input file size #{input_size} is smaller than object in S3 #{s3_size}"
      end
      size_analysis.append({
        key: fn,
        input_size: input_size,
        s3_size: s3_size,
        result: input_size == s3_size,
        message: message
      })
    end
    # Get verification status
    verification_status = "Unknown"
    if only_in_input.empty? && only_in_work.empty? && sizes_match
      verification_status = true
    end
    if only_in_input.present? || only_in_work.present? || !sizes_match
      verification_status = false
    end
    # gather the analysis
    analysis = {
      only_in_input: only_in_input,
      only_in_work: only_in_work,
      in_both_size_analysis: size_analysis
    }
    # return
    return verification_status, "Comparison completed", paths_compared, analysis
  end
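
  # restricted_file_names? filters housekeeping files out of the comparison.
  # It expects config/restricted_files.yml to hold a top-level
  # 'restricted_file_names' key with a list of file names; the entries below
  # are purely illustrative of the expected shape, not actual configuration:
  #
  #   restricted_file_names:
  #     - .DS_Store
  #     - Thumbs.db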
  def self.restricted_file_names?(file_name)
    parsed_file = YAML.load_file(Rails.root.join('config', 'restricted_files.yml'))
    parsed_file['restricted_file_names'].append('system_metadata.json')
    parsed_file['restricted_file_names'].map(&:downcase).include?(file_name.downcase)
  end
end
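
A minimal usage sketch, assuming a Rails console with the application loaded; the importer id 42 and the report handling below are illustrative and not part of the change:

# Hypothetical console session: verify a whole import and print each CSV report.
status, report_paths = VerifyCRCDatasetImport.verify_import(42)
puts "Import verified: #{status}"
report_paths.each do |path|
  CSV.foreach(path, headers: true) { |row| puts row.to_h }
end
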
require 'json'

namespace :rdms do
  namespace :crc_1280_import do
    desc 'Verify a CRC1280 import, or the import of a single experiment. This will compare the files in the import path with the objects in the S3 bucket'
    task :verify_import, [:import_id] => :environment do |task, args|
      puts "Verifying import #{args.import_id}"
      import_status, report_paths = VerifyCRCDatasetImport.verify_import(args.import_id)
      puts "Import status: #{import_status}"
      puts "Detailed reports are available at:"
      puts JSON.pretty_generate(report_paths)
    end
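
    # Example invocation (the importer id is illustrative):
    #   bundle exec rake "rdms:crc_1280_import:verify_import[42]"
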
    task :verify_experiment, [:import_id, :entry_id] => :environment do |task, args|
      puts "Verifying entry #{args.entry_id} of import #{args.import_id}"
      experiment_status, report_path = VerifyCRCDatasetImport.verify_experiment_and_report(args.import_id, args.entry_id)
      puts "Experiment import status: #{experiment_status}"
      puts "Detailed report is available at: #{report_path}"
    end
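
    # Example invocation (the importer and entry ids are illustrative):
    #   bundle exec rake "rdms:crc_1280_import:verify_experiment[42,1001]"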
  end
end