#! /usr/bin/env python
# ______________________________________________________________________
#
#  Filename: sam_samples
#
#  Created: 6-NOV-2001, Harry Melanson
#
# ______________________________________________________________________
"""Summarize available SAM data samples.

USAGE:
-----
    sam_samples [options] --version version

REQUIRED USER CHOICES:
---------------------
    --version version    -- Production version

OPTIONS:
-------
  SAM options:
    --data_tier tier     -- Data tier [default = reconstructed]
    --run_type type      -- Run type [default = ]
    --run_number run     -- Run number [default = ] (Currently disabled.)
    --file_name file     -- File name [default = ]

  SAM short-cuts:
    --data               -- For real data (same as --run_type physics)
    --mc                 -- For Monte Carlo (same as --run_type monte)
    --gen                -- Generated files (same as --data_tier generated)
    --d0g                -- D0GSTAR files (same as --data_tier simulated)
    --sim                -- d0sim files (same as --data_tier digitized)
    --reco               -- Reconstructed files (same as --data_tier reconstructed)
    --root               -- Root files (same as --data_tier root)

  Special options for this script:
    --level x            -- Print level [default = 0]
                            The larger the number, the more details are printed.
    -h                   -- Print help text and exit.
    --help               -- Print help text and exit.

DESCRIPTION:
-----------
    sam_samples gives a summary of event samples currently available in SAM.
    It is a 'thin layer' on top of the more general 'sam translate constraints'
    command, which is part of the SAM system. This script is designed to
    summarize any currently available event samples, based on some general
    idea of what 'standard' DZERO users might be interested in.

    A typical usage would be to ask what reconstructed real data samples are
    available from a particular version of RECO.

    As an example, to determine how many files / events have been
    reconstructed with p10.07.01, issue the command:

        sam_samples --version p10.07.01 --data

    For more details about this event sample (like a summary of the number of
    files per run), you can try increasing the levels of print out, e.g.

        sam_samples --version p10.07.01 --data --level 1

    To determine what reco_analyze files are available, try something like

        sam_samples --version p10.07.01 --data --root --level 1

    This command also supports Monte Carlo samples. For example, to find out
    what reconstructed Monte Carlo samples are available with p10.08.01, try

        sam_samples --version p10.08.01 --mc --level 1

    You can select various levels of production by using one of the following
    'short cuts':

        --gen  - Generated files
        --d0g  - D0GSTAR files
        --sim  - d0sim files
        --reco - reco files
        --root - reco_analyze root tuples

    For data samples, the reports are based on run numbers. For Monte Carlo
    samples, an attempt is made to summarize the reports based on generated
    signal channels. File names are used to determine which physics signals
    were generated. The algorithm used to extract such information is
    evolving, and may not accurately represent all generated samples.

REQUIREMENTS:
------------
    Users must set up sam, either directly, or via a setup D0RunII command.
    Also, users must setup the d0tools product.

LIMITATIONS:
-----------
    The determination of generated Monte Carlo signal sample depends on the
    details of the file naming convention. It is not foolproof.
"""
# ______________________________________________________________________

import sys
import os
import string

import scriptutil
import dbFileClient

# ______________________________________________________________________

__author__ = 'Harry Melanson, melanson@fnal.gov'
__version__ = '01.00.00'
__filename__ = 'sam_samples'
# NOTE: the help text is run through %-formatting against the module
# namespace, so the module docstring must never contain a bare percent sign.
__doc__ = __doc__ % vars()

# ______________________________________________________________________


class Options(object):
    """Container for the user's command-line choices.

    The SAM dimension values default to the wildcard '%'. 'debug' and
    'print_level' control how much this script prints.
    """

    def __init__(self):
        self.debug = 0
        self.version = ''
        self.data_tier = 'reconstructed%'
        self.run_type = '%'
        self.run_number = '%'
        self.file_name = '%'
        self.print_level = 0

    def __repr__(self):
        """Return a multi-line description of the selection criteria."""
        lines = [
            'Production version: %s' % self.version,
            'Data tier: %s' % self.data_tier,
            'Run type: %s' % self.run_type,
        ]
        if self.run_number != '%':
            # Fixed: this line used to print self.run_type under the
            # "Run number" label.
            lines.append('Run number: %s' % self.run_number)
        if self.file_name != '%':
            lines.append('File name: %s' % self.file_name)
        return '\n'.join(lines)

# ______________________________________________________________________


class SamFile(object):
    """One file returned by SAM, with attributes decoded from its name."""

    def __init__(self, name):
        self.name = name
        self.source = DataType(self.name)
        self.run_number = RunNumber(self.name)
        self.partitions = Partitions(self.name)
        self.sample = DataSample(self.name)
        # Placeholder; SamQueryResult.parse() replaces it with the object
        # returned by the database server when details are requested.
        self.samInfo = {}

    def __repr__(self):
        """Return the decoded attributes as one space-separated line."""
        fields = [
            self.name,
            self.source,
            '%i' % self.run_number,
            self.partitions,
            self.sample,
        ]
        return ' '.join(fields)

# ______________________________________________________________________


class SamQueryResult(object):
    """Parsed output of a 'sam translate constraints' command.

    'raw' holds the unparsed output lines; parse() fills in the summary
    counters and the list of SamFile objects.
    """

    # (label found in the output line, attribute that receives its value).
    # Checked in this order; the first label contained in a line wins.
    _SUMMARY_FIELDS = (
        ('File Count:', 'file_count'),
        ('Average File Size:', 'file_size'),
        ('Total File Size:', 'total_size'),
        ('Total Event Count:', 'event_count'),
    )

    def __init__(self, options):
        self.raw = []
        self.file_count = 0
        self.file_size = 0
        self.total_size = 0
        self.event_count = 0
        self.files = []
        self.dbFileServer = dbFileClient.getDbServer()
        self.options = options

    def __repr__(self):
        """Return the summary counters followed by one line per file."""
        lines = [
            'File count: %s' % self.file_count,
            'Average file size: %s' % self.file_size,
            'Total file size: %s' % self.total_size,
            'Event count: %s' % self.event_count,
        ]
        for sam_file in self.files:
            lines.append('%s' % sam_file)
        return '\n'.join(lines)

    def _parse_summary_line(self, line):
        """Store the trailing integer of a summary line.

        Returns 1 if the line was a summary line (or the 'Files:' header)
        and has been consumed, 0 otherwise.
        """
        for label, attribute in self._SUMMARY_FIELDS:
            if label in line:
                setattr(self, attribute, int(line.split()[-1]))
                return 1
        if 'Files:' in line:
            return 1
        return 0

    def parse(self):
        """Fill the counters and the file list from self.raw.

        Per-file details are fetched from the database server whenever
        print_level >= 1, because PrintSummary() reads samInfo.eventCount
        at every level above 0. (They used to be fetched only at level 2
        and above, which made '--level 1' fail.) The running per-file
        counter is still printed only at print_level >= 2.
        """
        fetch_details = self.options.print_level >= 1
        show_progress = self.options.print_level >= 2
        if fetch_details:
            print('Querying database for details about these files (%i files)...'
                  % len(self.raw))
        count = 0
        for line in self.raw:
            if self._parse_summary_line(line):
                continue
            filename = line.strip()
            if not filename:
                continue
            samfile = SamFile(filename)
            if fetch_details:
                count = count + 1
                samfile.samInfo = self.dbFileServer.getFileInfo(filename)
                if show_progress:
                    print(count)
            self.files.append(samfile)

# ______________________________________________________________________


def DataType(filename):
    """Return 'data' if the file name contains '.raw', otherwise 'mc'."""
    if '.raw' in filename:
        return 'data'
    return 'mc'


def DataSample(filename):
    """Return a short sample label decoded from the file name.

    For data files this is the field that follows the last 'reco_' in the
    name. For Monte Carlo files it is the text between the generator field
    and the minbias field, with any '-'-separated token containing 'Skip'
    removed. Returns 'unknown-generator' or 'unknown-mb' when the
    corresponding field cannot be found in the name.
    """
    if DataType(filename) == 'data':
        return filename.split('reco_')[-1].split('_')[0]

    # ________________________________________
    #
    #  Check whether file is from a known generator.
    #  There is deliberately no 'break': the last matching name wins.
    unknown_generator = 'unknown-generator'
    gen = unknown_generator
    for name in ('pythia', 'isajet', 'herwig', 'single'):
        if name in filename:
            gen = '_' + name + '_'
    if gen == unknown_generator:
        return gen

    # ________________________________________
    #
    #  Keep what follows the generator field
    after_generator = filename.split(gen)[-1]

    # ________________________________________
    #
    #  Get the number of minbias events overlaid
    unknown_mb = 'unknown-mb'
    mb = unknown_mb
    for name in ('mb-none', 'mb-poisson', 'mb-fixed'):
        if name in filename:
            mb = '_' + name
    if mb == unknown_mb:
        return mb

    # ________________________________________
    #
    #  Keep what precedes the minbias field and drop the 'Skip' tokens
    answer = after_generator.split(mb)[0]
    tokens = []
    for token in answer.split('-'):
        if 'Skip' in token:
            token = ''
        tokens.append(token)
    return '-'.join(tokens).replace('--', '-')


def RunNumber(filename):
    """Return the run number encoded in a data file name.

    Returns 0 for Monte Carlo files and -1 when the expected field is
    missing or is not an integer.
    """
    if DataType(filename) != 'data':
        return 0
    fields = filename.split('.raw')[0].split('_')
    if '_mrg_' in filename:
        index = -3      # Merged reco_analyze output
    else:
        index = -2      # Reco output
    try:
        return int(fields[index])
    except (ValueError, IndexError):
        return -1


def Partitions(filename):
    """Return the partition field of a data file name.

    Returns '' for Monte Carlo files, and also for data file names that
    have too few '_'-separated fields after '_all_' (previously such a
    name raised IndexError and aborted the whole listing).
    """
    if DataType(filename) != 'data':
        return ''
    fields = filename.split('_all_')[-1].split('_')
    try:
        if fields[1] != 'mrg':
            field = fields[1]
        else:
            field = fields[2]
    except IndexError:
        return ''
    return field.split('.')[0]

# ______________________________________________________________________
#
#  Return result from sam translate constraints command


def SamTranslateConstraints(options):
    """Run 'sam translate constraints' and return a parsed SamQueryResult.

    The command is executed through os.popen. The option values are
    pasted into the command string unescaped, so they are interpreted by
    the shell.
    """
    command = ' '.join(['sam translate constraints',
                        "--dim='",
                        'version', options.version, 'and',
                        'data_tier', options.data_tier, 'and',
                        'run_type', options.run_type, 'and',
                        # run_number is currently disabled:
                        # 'run_number', options.run_number, 'and',
                        'file_name', options.file_name,
                        "'"])
    if options.debug == 1:
        print(command)
    print("Querying database to get associated files...")

    result = SamQueryResult(options)
    pipe = os.popen(command, 'r')
    try:
        result.raw = pipe.readlines()
    finally:
        # The pipe used to be left for the garbage collector to close.
        pipe.close()

    if options.debug == 1:
        for line in result.raw:
            print(line[:-1])
    result.parse()
    if options.debug == 1:
        print(result)
    return result

# ______________________________________________________________________
#
#  Print summary of results


def PrintSummary(options, results):
    """Print the overall totals and, for print_level > 0, a breakdown.

    Data files are grouped by run number and Monte Carlo files by sample
    label. At print_level >= 3 every file of each group is listed too.
    """
    print('')
    print('SAM samples currently available')
    print('-------------------------------')
    print('')
    print(options)
    print('')
    print('File count: %s' % results.file_count)
    print('Event count: %s' % results.event_count)
    print('Average file size: %s' % results.file_size)
    print('Total file size: %s' % results.total_size)
    print('')

    if options.print_level <= 0:
        return

    summary = {}
    events = {}
    for sam_file in results.files:
        if sam_file.source == 'data':
            key = sam_file.run_number
        else:
            key = sam_file.sample
        summary.setdefault(key, []).append(sam_file)
        events[key] = events.get(key, 0) + sam_file.samInfo.eventCount

    # Keys may mix integer run numbers and string sample labels. Sort the
    # integers first (the order Python 2 produced) so that the sort also
    # works on Python 3, where int and str cannot be compared.
    keys = sorted(summary, key=lambda k: (not isinstance(k, int), k))

    for key in keys:
        group = summary[key]
        if group[0].source == 'data':
            print('Run: %s, Files: %3i, Events: %7i'
                  % (key, len(group), events[key]))
        else:
            print('Channel: %s, Files: %3i, Events: %7i'
                  % (key, len(group), events[key]))
        if options.print_level >= 3:
            for sam_file in group:
                print(' Events: %5i, %s'
                      % (sam_file.samInfo.eventCount, sam_file.name))

# ______________________________________________________________________
#
#  Return command line options


def GetUserOptions():
    """Parse the command line and return a populated Options object.

    Calls scriptutil.FatalError when no production version was given.
    """
    userOptions = Options()

    # 'd0g' was missing from this list even though '--d0g' is documented
    # and handled below.
    # NOTE(review): 'digi' has always been accepted but never acted upon;
    # it is kept so existing command lines continue to parse.
    supportedOptions = ['version=', 'data_tier=', 'run_type=',
                        'run_number=', 'file_name=',
                        'data', 'mc',
                        'gen', 'd0g', 'digi', 'sim', 'reco', 'root',
                        'level=', 'debug']
    optlist, args = scriptutil.getOptions('', supportedOptions, __doc__)

    # Standard SAM dimensions: flag -> Options attribute set to the value
    dimensions = {
        '--version': 'version',
        '--data_tier': 'data_tier',
        '--run_type': 'run_type',
        '--run_number': 'run_number',
        '--file_name': 'file_name',
    }
    # Short cuts: flag -> (Options attribute, fixed value)
    shortcuts = {
        '--data': ('run_type', 'physics%'),
        '--mc': ('run_type', 'monte%'),
        '--gen': ('data_tier', 'generated%'),
        '--d0g': ('data_tier', 'simulated%'),
        '--sim': ('data_tier', 'digitized%'),
        '--reco': ('data_tier', 'reconstructed%'),
        '--root': ('data_tier', 'root%'),
    }

    for opt in optlist:
        if opt.flag in dimensions:
            setattr(userOptions, dimensions[opt.flag], opt.value)
        elif opt.flag in shortcuts:
            attribute, value = shortcuts[opt.flag]
            setattr(userOptions, attribute, value)
        # Internal options
        elif opt.flag == '--debug':
            userOptions.debug = 1
        elif opt.flag == '--level':
            userOptions.print_level = int(opt.value)

    if userOptions.version == '':
        scriptutil.FatalError(
            "You must specify a production version with the --version option.",
            "Use the --help option to get a description on how to use this command.")

    return userOptions

# ______________________________________________________________________


def process_command():
    """Process the command."""
    options = GetUserOptions()
    result = SamTranslateConstraints(options)
    PrintSummary(options, result)

# ______________________________________________________________________
#
#  Run command

if __name__ == '__main__':
    process_command()
    sys.exit(0)