03/18/2011 10:57:22;0080; pbs_mom;Req;dis_request_read;decoding command HoldJob from PBS_Server 03/18/2011 10:57:22;0100; pbs_mom;Req;;Type HoldJob request received from PBS_Server@cluster.local, sock=10 03/18/2011 10:57:22;0008; pbs_mom;Job;process_request;request type HoldJob from host cluster.local received 03/18/2011 10:57:22;0008; pbs_mom;Job;process_request;request type HoldJob from host cluster.local allowed 03/18/2011 10:57:22;0008; pbs_mom;Job;dispatch_request;dispatching request HoldJob on sd=10 03/18/2011 10:57:22;0002; pbs_mom;n/a;mom_close_poll;entered 03/18/2011 10:57:22;0080; pbs_mom;Req;dis_request_read;decoding command Disconnect from PBS_Server 03/18/2011 10:57:22;0008; pbs_mom;Job;scan_for_terminated;entered 03/18/2011 10:57:22;0080; pbs_mom;Svr;mom_get_sample;proc_array load started 03/18/2011 10:57:22;0080; pbs_mom;n/a;mom_get_sample;proc_array loaded - nproc=88 03/18/2011 10:57:22;0080; pbs_mom;n/a;cput_sum;proc_array loop start - jobid = 96.cluster.pac 03/18/2011 10:57:22;0002; pbs_mom;n/a;cput_sum;cput_sum: session=3156 pid=3156 cputime=0 (cputfactor=1.000000) 03/18/2011 10:57:22;0002; pbs_mom;n/a;cput_sum;cput_sum: session=3156 pid=3173 cputime=0 (cputfactor=1.000000) 03/18/2011 10:57:22;0002; pbs_mom;n/a;cput_sum;cput_sum: session=3156 pid=3174 cputime=10 (cputfactor=1.000000) 03/18/2011 10:57:22;0080; pbs_mom;n/a;mem_sum;proc_array loop start - jobid = 96.cluster.pac 03/18/2011 10:57:22;0002; pbs_mom;n/a;mem_sum;mem_sum: session=3156 pid=3156 vsize=8912896 sum=8912896 03/18/2011 10:57:22;0002; pbs_mom;n/a;mem_sum;mem_sum: session=3156 pid=3173 vsize=8916992 sum=17829888 03/18/2011 10:57:22;0002; pbs_mom;n/a;mem_sum;mem_sum: session=3156 pid=3174 vsize=3735552 sum=21565440 03/18/2011 10:57:22;0080; pbs_mom;n/a;resi_sum;proc_array loop start - jobid = 96.cluster.pac 03/18/2011 10:57:22;0002; pbs_mom;n/a;resi_sum;resi_sum: session=3156 pid=3156 rss=1003520 sum=1003520 03/18/2011 10:57:22;0002; pbs_mom;n/a;resi_sum;resi_sum: session=3156 pid=3173 rss=1028096 sum=2031616 03/18/2011 10:57:22;0002; pbs_mom;n/a;resi_sum;resi_sum: session=3156 pid=3174 rss=294912 sum=2326528 03/18/2011 10:57:22;0080; pbs_mom;Job;96.cluster.pac;checking job w/subtask pid=3175 (child pid=3175) 03/18/2011 10:57:22;0080; pbs_mom;Job;96.cluster.pac;found match with job subtask for pid=3175 03/18/2011 10:57:22;0080; pbs_mom;Svr;scan_for_exiting;searching for exiting jobs 03/18/2011 10:57:25;0002; pbs_mom;n/a;mom_server_all_update_stat;composing status update for server 03/18/2011 10:57:25;0002; pbs_mom;n/a;sessions;sessions[0]: pid 3156 sid 3156 03/18/2011 10:57:25;0002; pbs_mom;n/a;sessions;sessions[1]: pid 3173 sid 3156 03/18/2011 10:57:25;0002; pbs_mom;n/a;sessions;sessions[1]: pid 3174 sid 3156 03/18/2011 10:57:25;0002; pbs_mom;n/a;sessions;nsessions=1 03/18/2011 10:57:25;0002; pbs_mom;n/a;sessions;sessions[0]: pid 3156 sid 3156 03/18/2011 10:57:25;0002; pbs_mom;n/a;sessions;sessions[1]: pid 3173 sid 3156 03/18/2011 10:57:25;0002; pbs_mom;n/a;sessions;sessions[1]: pid 3174 sid 3156 03/18/2011 10:57:25;0002; pbs_mom;n/a;sessions;nsessions=1 03/18/2011 10:57:25;0002; pbs_mom;n/a;nusers;nusers[0]: pid 3156 uid 501 03/18/2011 10:57:25;0002; pbs_mom;n/a;nusers;nusers[1]: pid 3173 uid 501 03/18/2011 10:57:25;0002; pbs_mom;n/a;nusers;nusers[1]: pid 3174 uid 501 03/18/2011 10:57:25;0002; pbs_mom;n/a;nusers;nusers=1 03/18/2011 10:57:25;0002; pbs_mom;n/a;totmem;totmem: total mem=18864599040 03/18/2011 10:57:25;0002; pbs_mom;n/a;availmem;availmem: free mem=20328415232 03/18/2011 10:57:25;0002; pbs_mom;n/a;mom_server_update_stat;mom_server_update_stat: sending to server "opsys=linux" 03/18/2011 10:57:25;0002; pbs_mom;n/a;mom_server_update_stat;mom_server_update_stat: sending to server "uname=Linux compute-0-1.local 2.6.18-194.17.4.el5 #1 SMP Mon Oct 25 15:50:53 EDT 2010 x86_64" 03/18/2011 10:57:25;0002; pbs_mom;n/a;mom_server_update_stat;mom_server_update_stat: sending to server "sessions=3156" 03/18/2011 10:57:25;0002; pbs_mom;n/a;mom_server_update_stat;mom_server_update_stat: sending to server "nsessions=1" 03/18/2011 10:57:25;0002; pbs_mom;n/a;mom_server_update_stat;mom_server_update_stat: sending to server "nusers=1" 03/18/2011 10:57:25;0002; pbs_mom;n/a;mom_server_update_stat;mom_server_update_stat: sending to server "idletime=212" 03/18/2011 10:57:25;0002; pbs_mom;n/a;mom_server_update_stat;mom_server_update_stat: sending to server "totmem=18422460kb" 03/18/2011 10:57:25;0002; pbs_mom;n/a;mom_server_update_stat;mom_server_update_stat: sending to server "availmem=19851968kb" 03/18/2011 10:57:25;0002; pbs_mom;n/a;mom_server_update_stat;mom_server_update_stat: sending to server "physmem=1026824kb" 03/18/2011 10:57:25;0002; pbs_mom;n/a;mom_server_update_stat;mom_server_update_stat: sending to server "ncpus=2" 03/18/2011 10:57:25;0002; pbs_mom;n/a;mom_server_update_stat;mom_server_update_stat: sending to server "loadave=0.49" 03/18/2011 10:57:25;0002; pbs_mom;n/a;mom_server_update_stat;mom_server_update_stat: sending to server "gres=" 03/18/2011 10:57:25;0002; pbs_mom;n/a;mom_server_update_stat;mom_server_update_stat: sending to server "netload=210644" 03/18/2011 10:57:25;0002; pbs_mom;n/a;mom_server_update_stat;mom_server_update_stat: sending to server "state=free" 03/18/2011 10:57:25;0002; pbs_mom;n/a;mom_server_update_stat;mom_server_update_stat: sending to server "jobs=96.cluster.pac" 03/18/2011 10:57:25;0002; pbs_mom;n/a;mom_server_update_stat;mom_server_update_stat: sending to server "varattr= " 03/18/2011 10:57:25;0002; pbs_mom;n/a;mom_server_update_stat;status update successfully sent to cluster.local 03/18/2011 10:57:25;0080; pbs_mom;Svr;mom_get_sample;proc_array load started 03/18/2011 10:57:25;0080; pbs_mom;n/a;mom_get_sample;proc_array loaded - nproc=87 03/18/2011 10:57:25;0080; pbs_mom;n/a;cput_sum;proc_array loop start - jobid = 96.cluster.pac 03/18/2011 10:57:25;0002; pbs_mom;n/a;cput_sum;cput_sum: session=3156 pid=3156 cputime=0 (cputfactor=1.000000) 03/18/2011 10:57:25;0002; pbs_mom;n/a;cput_sum;cput_sum: session=3156 pid=3173 cputime=0 (cputfactor=1.000000) 03/18/2011 10:57:25;0002; pbs_mom;n/a;cput_sum;cput_sum: session=3156 pid=3174 cputime=13 (cputfactor=1.000000) 03/18/2011 10:57:25;0080; pbs_mom;n/a;mem_sum;proc_array loop start - jobid = 96.cluster.pac 03/18/2011 10:57:25;0002; pbs_mom;n/a;mem_sum;mem_sum: session=3156 pid=3156 vsize=8912896 sum=8912896 03/18/2011 10:57:25;0002; pbs_mom;n/a;mem_sum;mem_sum: session=3156 pid=3173 vsize=8916992 sum=17829888 03/18/2011 10:57:25;0002; pbs_mom;n/a;mem_sum;mem_sum: session=3156 pid=3174 vsize=3735552 sum=21565440 03/18/2011 10:57:25;0080; pbs_mom;n/a;resi_sum;proc_array loop start - jobid = 96.cluster.pac 03/18/2011 10:57:25;0002; pbs_mom;n/a;resi_sum;resi_sum: session=3156 pid=3156 rss=1003520 sum=1003520 03/18/2011 10:57:25;0002; pbs_mom;n/a;resi_sum;resi_sum: session=3156 pid=3173 rss=1028096 sum=2031616 03/18/2011 10:57:25;0002; pbs_mom;n/a;resi_sum;resi_sum: session=3156 pid=3174 rss=294912 sum=2326528 03/18/2011 10:57:25;0008; pbs_mom;Req;send_sisters;sending command POLL_JOB for job 96.cluster.pac (7) 03/18/2011 10:57:27;0008; pbs_mom;Job;scan_for_terminated;entered 03/18/2011 10:57:27;0080; pbs_mom;Svr;mom_get_sample;proc_array load started 03/18/2011 10:57:27;0080; pbs_mom;n/a;mom_get_sample;proc_array loaded - nproc=85 03/18/2011 10:57:27;0080; pbs_mom;n/a;cput_sum;proc_array loop start - jobid = 96.cluster.pac 03/18/2011 10:57:27;0002; pbs_mom;n/a;cput_sum;cput_sum: session=3156 pid=3156 cputime=15 (cputfactor=1.000000) 03/18/2011 10:57:27;0080; pbs_mom;n/a;mem_sum;proc_array loop start - jobid = 96.cluster.pac 03/18/2011 10:57:27;0002; pbs_mom;n/a;mem_sum;mem_sum: session=3156 pid=3156 vsize=0 sum=0 03/18/2011 10:57:27;0080; pbs_mom;n/a;resi_sum;proc_array loop start - jobid = 96.cluster.pac 03/18/2011 10:57:27;0002; pbs_mom;n/a;resi_sum;resi_sum: session=3156 pid=3156 rss=0 sum=0 03/18/2011 10:57:27;0080; pbs_mom;Job;96.cluster.pac;checking job w/subtask pid=0 (child pid=3156) 03/18/2011 10:57:27;0080; pbs_mom;Job;96.cluster.pac;found match with job task 0 for pid=3156 03/18/2011 10:57:27;0008; pbs_mom;Job;scan_for_terminated;pid 3156 harvested for job 96.cluster.pac, task 1, exitcode=96 03/18/2011 10:57:27;0008; pbs_mom;Job;96.cluster.pac;kill_task: sending signal 9 to task 1, session 3156 03/18/2011 10:57:27;0008; pbs_mom;Svr;task_save;saving task in /opt/torque/mom_priv/jobs/96.cluster.pac.TK 03/18/2011 10:57:27;0008; pbs_mom;Job;96.cluster.pac;kill_task: could not send signal 9 to task 1 (session 3156)--no process was found with this session ID (marking task as killed)! 03/18/2011 10:57:27;0008; pbs_mom;Svr;task_save;saving task in /opt/torque/mom_priv/jobs/96.cluster.pac.TK 03/18/2011 10:57:27;0080; pbs_mom;Job;96.cluster.pac;scan_for_terminated: job 96.cluster.pac task 1 terminated, sid=3156 03/18/2011 10:57:27;0080; pbs_mom;Svr;scan_for_exiting;searching for exiting jobs 03/18/2011 10:57:27;0008; pbs_mom;Job;96.cluster.pac;job was terminated 03/18/2011 10:57:27;0008; pbs_mom;Req;send_sisters;sending command KILL_JOB for job 96.cluster.pac (2) 03/18/2011 10:57:27;0008; pbs_mom;Job;96.cluster.pac;no sisters contacted - setting job substate to EXITING 03/18/2011 10:57:27;0008; pbs_mom;Job;96.cluster.pac;task is dead 03/18/2011 10:57:27;0008; pbs_mom;Svr;task_save;saving task in /opt/torque/mom_priv/jobs/96.cluster.pac.TK 03/18/2011 10:57:27;0008; pbs_mom;Job;kill_job;scan_for_exiting: sending signal 9, "KILL" to job 96.cluster.pac, reason: local task termination detected 03/18/2011 10:57:27;0002; pbs_mom;n/a;run_pelog;userepilog script '/opt/torque/mom_priv/epilogue.precancel' for job 96.cluster.pac does not exist (cwd: /opt/torque/mom_priv,pid: 2957) 03/18/2011 10:57:27;0008; pbs_mom;Job;96.cluster.pac;kill_job done (killed 0 processes) 03/18/2011 10:57:27;0080; pbs_mom;Job;96.cluster.pac;sending preobit jobstat 03/18/2011 10:57:27;0080; pbs_mom;Svr;preobit_reply;top of preobit_reply 03/18/2011 10:57:27;0080; pbs_mom;Svr;preobit_reply;DIS_reply_read/decode_DIS_replySvr worked, top of while loop 03/18/2011 10:57:27;0080; pbs_mom;Svr;preobit_reply;in while loop, no error from job stat 03/18/2011 10:57:27;0080; pbs_mom;Job;96.cluster.pac;performing job clean-up in preobit_reply() 03/18/2011 10:57:27;0080; pbs_mom;Job;96.cluster.pac;epilog subtask created with pid 3179 - substate set to JOB_SUBSTATE_OBIT - registered post_epilogue 03/18/2011 10:57:27;0002; pbs_mom;n/a;mom_close_poll;entered 03/18/2011 10:57:27;0008; pbs_mom;Job;scan_for_terminated;entered 03/18/2011 10:57:27;0080; pbs_mom;Svr;mom_get_sample;proc_array load started 03/18/2011 10:57:27;0080; pbs_mom;n/a;mom_get_sample;proc_array loaded - nproc=85 03/18/2011 10:57:27;0080; pbs_mom;n/a;cput_sum;proc_array loop start - jobid = 96.cluster.pac 03/18/2011 10:57:27;0080; pbs_mom;n/a;mem_sum;proc_array loop start - jobid = 96.cluster.pac 03/18/2011 10:57:27;0080; pbs_mom;n/a;resi_sum;proc_array loop start - jobid = 96.cluster.pac 03/18/2011 10:57:27;0080; pbs_mom;Job;96.cluster.pac;checking job w/subtask pid=3179 (child pid=3179) 03/18/2011 10:57:27;0080; pbs_mom;Job;96.cluster.pac;found match with job subtask for pid=3179 03/18/2011 10:57:27;0080; pbs_mom;Req;post_epilogue;preparing obit message for job 96.cluster.pac 03/18/2011 10:57:27;0080; pbs_mom;Job;96.cluster.pac;obit sent to server 03/18/2011 10:57:27;0001; pbs_mom;Job;96.cluster.pac;job obit acknowledge received - substate set to JOB_SUBSTATE_EXITED 03/18/2011 10:57:27;0080; pbs_mom;Req;dis_request_read;decoding command CopyFiles from PBS_Server 03/18/2011 10:57:27;0100; pbs_mom;Req;;Type CopyFiles request received from PBS_Server@cluster.local, sock=10 03/18/2011 10:57:27;0008; pbs_mom;Job;process_request;request type CopyFiles from host cluster.local received 03/18/2011 10:57:27;0008; pbs_mom;Job;process_request;request type CopyFiles from host cluster.local allowed 03/18/2011 10:57:27;0008; pbs_mom;Job;dispatch_request;dispatching request CopyFiles on sd=10 03/18/2011 10:57:27;0008; pbs_mom;Job;96.cluster.pac;attempting to copy file 'cluster.pac:/home/ublcr/test.sh.o96' 03/18/2011 10:57:27;0008; pbs_mom;Job;96.cluster.pac;forking to user, uid: 501 gid: 501 homedir: '/home/ublcr' 03/18/2011 10:57:27;0002; pbs_mom;n/a;mom_close_poll;entered 03/18/2011 10:57:28;0008; pbs_mom;Job;scan_for_terminated;entered 03/18/2011 10:57:28;0080; pbs_mom;Svr;mom_get_sample;proc_array load started 03/18/2011 10:57:28;0080; pbs_mom;n/a;mom_get_sample;proc_array loaded - nproc=85 03/18/2011 10:57:28;0080; pbs_mom;n/a;cput_sum;proc_array loop start - jobid = 96.cluster.pac 03/18/2011 10:57:28;0080; pbs_mom;n/a;mem_sum;proc_array loop start - jobid = 96.cluster.pac 03/18/2011 10:57:28;0080; pbs_mom;n/a;resi_sum;proc_array loop start - jobid = 96.cluster.pac 03/18/2011 10:57:28;0080; pbs_mom;Job;96.cluster.pac;checking job w/subtask pid=0 (child pid=3180) 03/18/2011 10:57:28;0008; pbs_mom;Job;scan_for_terminated;pid 3180 not tracked, statloc=0, exitval=0