#! /usr/bin/perl
# based on the script found at
# http://www.clusterresources.com/products/torque/docs/2.6jobcheckpoint.shtml
################################################################################
#
# Usage: checkpoint_script <sessionId> <jobId> <userId> <groupId>
#                          <checkpointDir> <checkpointName> <signalNum> <depth>
#
# This script is invoked by pbs_mom to checkpoint a job.
#
# If an orterun process is found in the job's process group, the job is
# checkpointed with ompi-checkpoint and the resulting snapshot reference is
# appended to ~/.qlu/torque/jobid2ompi_snap_ref. Otherwise the job is
# checkpointed with cr_checkpoint into <checkpointDir>/<checkpointName>.
#
################################################################################
use strict;
use warnings;
use Sys::Syslog;
use File::Path;

# Log levels:
# 0 = none -- no logging
# 1 = fail -- log only failures
# 2 = info -- log invocations
# 3 = debug -- log all subcommands
my $logLevel = 3;

logPrint(2, "Invoked: $0 " . join(' ', @ARGV) . "\n");

my ($sessionId, $jobId, $userId, $groupId, $signalNum, $checkpointDir,
    $checkpointName, $depth);
my $usage = "Usage: $0 <sessionId> <jobId> <userId> <groupId> <checkpointDir> <checkpointName> <signalNum> <depth>\n";

# Note that depth is not used in this script but could control a limit to the
# number of checkpoint image files that are preserved on the disk.
#
# Note also that a request was made to identify whether this script was invoked
# by the job's owner or by a system administrator. While this information is
# known to pbs_server, it is not propagated to pbs_mom and thus it is not
# possible to pass this to the script.
# Therefore, a workaround is to invoke qmgr and attempt to set a trivial
# variable. This will fail if the invoker is not a manager.
if (@ARGV == 8) {
    ($sessionId, $jobId, $userId, $groupId, $checkpointDir, $checkpointName,
        $signalNum, $depth) = @ARGV;
}
else {
    logDie(1, $usage);
}

# The session id and the signal number are interpolated into shell command
# lines further down, so only plain non-negative integers are accepted.
logDie(1, "Invalid session id ($sessionId)\n")
    unless $sessionId =~ /\A(\d+)\z/;
$sessionId = $1;
logDie(1, "Invalid signal number ($signalNum)\n")
    unless $signalNum =~ /\A(\d+)\z/;
$signalNum = $1;

# Drop privileges to the job owner. Groups go first, while we are still
# allowed to change them.
my $gid = getgrnam($groupId);
logDie(1, "Unable to resolve group id ($groupId)\n") unless defined $gid;
# Assigning "<gid> <gid>" to $) sets the effective gid and also replaces the
# supplementary group list (see perlvar); a bare "$) = $gid" would leave the
# caller's supplementary groups in place.
$) = "$gid $gid";
$( = $gid;
# $( and $) read back as space-separated lists whose first field is the gid.
my $realGids = $(;
my $effectiveGids = $);
my ($realGid) = split ' ', $realGids;
my ($effectiveGid) = split ' ', $effectiveGids;
logDie(1, "Unable to set gid: $gid")
    unless $realGid == $gid && $effectiveGid == $gid;

my ($uid, $home) = (getpwnam($userId))[2, 7];
logDie(1, "Unable to resolve user id ($userId)\n") unless defined $uid;
$< = $uid;
$> = $uid;
logDie(1, "Unable to set uid: $uid") unless $< == $uid && $> == $uid;

# Change to the checkpoint directory where we want the checkpoint to be
# created. This must happen regardless of the configured log level.
chdir $checkpointDir
    or logDie(1, "Unable to cd to checkpoint dir ($checkpointDir): $!\n");

# set $HOME so ompi finds the config file
$ENV{HOME} = $home;

# Look for an orterun process in the job's process group. pgrep prints one
# PID per line; only the first one is used so that no newline can end up in
# the command line built below.
my $pgrepOutput = `pgrep -g $sessionId orterun`;
$pgrepOutput = '' unless defined $pgrepOutput;
my ($ortpid) = $pgrepOutput =~ /^(\d+)$/m;

my $cmd;
if ($ortpid) {
    $cmd = "ompi-checkpoint";
    $cmd .= " --term" if $signalNum == 15;
    $cmd .= " $ortpid";
}
else {
    $cmd = "cr_checkpoint";
    $cmd .= " --signal $signalNum" if $signalNum;
    $cmd .= " --tree $sessionId";
    $cmd .= " --file " . shellQuote("$checkpointDir/$checkpointName");
}

# untaint $cmd
# NOTE(review): this is a blanket untaint; it relies on the arguments handed
# over by the caller being trustworthy.
($cmd) = $cmd =~ /\A(.*)\z/s;

my $output = `$cmd 2>&1`;
my $status = $?;
logDie(1, "Unable to run subcommand ($cmd): $!\n") if $status == -1;
$output = '' unless defined $output;
my $rc = $status >> 8;

# A failed checkpoint is always fatal, whatever the log level; logPrint()
# already decides on its own whether the message reaches syslog. The low bits
# of $? are checked too so that a subcommand killed by a signal is not
# mistaken for success.
if ($status != 0) {
    my $signal = $status & 127;
    my $detail = $signal ? "signal=$signal" : "rc=$rc";
    logDie(1, "Subcommand ($cmd) failed with $detail:\n$output");
}
logPrint(3, "Subcommand ($cmd) yielded rc=$rc:\n$output");

if ($ortpid) {
    # store the snapshot reference below the job owner's home directory
    my $snap_ref_dir = $home . "/.qlu/torque";
    my $ref_file = $snap_ref_dir . "/jobid2ompi_snap_ref";

    if (! -d $snap_ref_dir) {
        eval { mkpath($snap_ref_dir); };
        if ($@) {
            logDie(1, "Cannot create directory \"$snap_ref_dir\": $@");
        }
    }

    # The snapshot reference is taken to be the last whitespace-separated
    # word of the ompi-checkpoint output.
    my $ref = (split ' ', $output)[-1];
    logDie(1, "No snapshot reference found in output of ($cmd):\n$output")
        unless defined $ref;

    my $nodelist = getNodelist("/var/spool/pbs/aux/" . $jobId);
    addJobRef($jobId, $ref, $nodelist, $ref_file);
}

exit 0;

############################################################################
#
# shellQuote($text)
# Returns $text wrapped in single quotes, with embedded single quotes
# escaped, so that a POSIX shell sees it as exactly one word.
#
############################################################################
sub shellQuote {
    my ($text) = @_;
    $text =~ s/'/'\\''/g;
    return "'$text'";
}

############################################################################
#
# getNodelist($nodefile)
# converts a NODEFILE to a nodelist (<#nodes>*<#cpus>+<#nodes>*<#cpus>...)
#
# Each distinct line of the file counts as one node and the number of times
# it occurs as that node's cpu count. Terms are ordered by ascending cpu
# count. Blank lines are ignored. Returns "" if the file does not exist or
# cannot be read.
#
############################################################################
sub getNodelist {
    my ($nodefile) = @_;

    return "" if ! -f $nodefile;

    my $fh;
    if (!open($fh, "<", $nodefile)) {
        # Best effort: the node list is only informational.
        logPrint(1, "cannot open $nodefile: $!");
        return "";
    }

    # cpus per node = number of lines naming that node
    my %cpusOf;
    while (defined(my $host = <$fh>)) {
        chomp $host;
        next if $host !~ /\S/;
        $cpusOf{$host}++;
    }
    close($fh);

    # nodes per cpu count
    my %nodesWith;
    $nodesWith{$_}++ for values %cpusOf;

    return join("+",
        map { "$nodesWith{$_}*$_" } sort { $a <=> $b } keys %nodesWith);
}

############################################################################
#
# addJobRef($jobid, $ref, $nodelist, $ref_file)
# makes sure that "$jobid $ref $nodelist" is the latest entry for $jobid in
# $ref_file. Nothing is written if the latest stored reference for $jobid
# already equals $ref. Dies (via logDie) on I/O errors. Returns 1.
#
############################################################################
sub addJobRef {
    my ($jobid, $ref, $nodelist, $ref_file) = @_;

    my $saved_ref = getLatestRef($jobid, $ref_file);
    return 1 if $saved_ref eq $ref;    # nothing to do

    open(my $fh, ">>", $ref_file)
        or logDie(1, "cannot open $ref_file: $!");
    print {$fh} "$jobid $ref $nodelist\n"
        or logDie(1, "cannot write to $ref_file: $!");
    # Buffered write errors only surface at close time.
    close($fh)
        or logDie(1, "cannot close $ref_file: $!");

    return 1;
}

############################################################################
#
# getLatestRef($jobid, $ref_file)
# return the latest snapshot reference with given jobid from given file
#
# Lines have the form "<jobid> <ref> ..."; the last matching line wins.
# Returns "" if the file does not exist or holds no entry for $jobid.
# Dies (via logDie) if the file exists but cannot be opened.
#
############################################################################
sub getLatestRef {
    my ($jobid, $ref_file) = @_;

    # return nothing if file does not exist
    return "" if ! -f $ref_file;

    open(my $fh, "<", $ref_file)
        or logDie(1, "cannot open $ref_file: $!");

    my $ref = "";
    while (defined(my $line = <$fh>)) {
        # Skip lines that do not parse instead of reusing stale captures.
        next unless $line =~ /^(\S+)\s+(\S+)/;
        $ref = $2 if $1 eq $jobid;
    }
    close($fh);

    return $ref;
}

################################################################################
# logPrint($level, $message)
# Write a message to syslog if $level does not exceed the configured $logLevel
################################################################################
sub logPrint {
    my ($level, $message) = @_;
    my @severity = ('none', 'warning', 'info', 'debug');

    return if $level > $logLevel;

    openlog('checkpoint_script', '', 'user');
    # Pass the message as an argument, not as the format string, so that a
    # '%' in command output is logged literally.
    syslog($severity[$level], '%s', $message);
    closelog();
    return;
}

################################################################################
# logDie($level, $message)
# Write a message (to syslog) and die
################################################################################
sub logDie {
    my ($level, $message) = @_;
    logPrint($level, $message);
    die($message);
}