EMAILS: PBS JOB ID:76.lancelot-laptop [lancelot@cfa ~]$ cat job3.pbs #!/bin/bash #PBS -N job3 #PBS -o job3.log #PBS -e job3.err #PBS -q sai #PBA -I #PBS -l nodes=2:ppn=2 #PBS -l walltime=24:00:00 #PBS -l cput=1:00:00 #PBS -V cd /home/lancelot echo running on hosts `hostname` echo time is `date` echo directory is $PWD echo job runs on the nodes: cat $PBS_NODEFILE NPROCS=`wc -l < $PBS_NODEFILE` echo this job has allocated $NPROCS nodes mpiexec -np 4 ./prog [lancelot@cfa ~]$ cat prog #!/bin/bash echo 999999999|./icpi root@lancelot-laptop:/home/lancelot# pbsnodes lancelot-laptop state = free np = 2 ntype = cluster jobs = 0/76.lancelot-laptop status = rectime=1343122703,varattr=,jobs=76.lancelot-laptop,state=free,netload=95261305,gres=,loadave=0.57,ncpus=2,physmem=1542608kb,availmem=2981784kb,totmem=3494344kb,idletime=14158,nusers=2,nsessions=13,sessions=1100 792 1309 1349 1365 1374 1384 1439 1452 1682 1749 1798 2737,uname=Linux lancelot-laptop 2.6.32-41-generic #94-Ubuntu SMP Fri Jul 6 16:51:39 UTC 2012 i686,opsys=linux mom_service_port = 15002 mom_manager_port = 15003 gpus = 0 cfa state = free np = 12 ntype = cluster jobs = 0/76.lancelot-laptop status = rectime=1343122703,varattr=,jobs=76.lancelot-laptop,state=free,netload=492745850,gres=,loadave=0.00,ncpus=12,physmem=8015456kb,availmem=22517440kb,totmem=24399448kb,idletime=2992,nusers=5,nsessions=58,sessions=18335 469 27670 752 18344 834 1171 1982 2226 3403 2290 14058 14160 14359 14579 15144 15464 15698 15913 16121 16201 16444 16988 17058 17603 18048 18278 18378 18379 18405 18411 18479 18557 18884 19096 22028 22149 22256 22257 22283 22290 22347 27347 27515 27561 30703 30712 30795 30797 30823 30829 30905 32454 32458 32459 32467 32469 32489,uname=Linux cfa 2.6.32-220.el6.x86_64 #1 SMP Tue Dec 6 19:48:22 GMT 2011 x86_64,opsys=linux mom_service_port = 15002 mom_manager_port = 15003 gpus = 0 root@lancelot-laptop:/home/lancelot# tracejob 76 Job: 76.lancelot-laptop 07/24/2012 15:01:03 M JOIN JOB as node 1 07/24/2012 15:01:03 S enqueuing into sai, state 1 hop 1 07/24/2012 15:01:03 S Job Queued at request of lancelot@cfa, owner = lancelot@cfa, job name = job3, queue = sai 07/24/2012 15:01:03 S Job Modified at request of Scheduler@lancelot-laptop 07/24/2012 15:01:03 L Job Run 07/24/2012 15:01:03 S Job Run at request of Scheduler@lancelot-laptop 07/24/2012 15:01:03 A queue=sai 07/24/2012 15:01:03 A user=lancelot group=lancelot jobname=job3 queue=sai ctime=1343113263 qtime=1343113263 etime=1343113263 start=1343113263 owner=lancelot@cfa exec_host=cfa/0+lancelot-laptop/0 Resource_List.cput=01:00:00 Resource_List.neednodes=2 Resource_List.nodect=2 Resource_List.nodes=2 Resource_List.walltime=24:00:00 07/24/2012 15:01:57 S Not sending email: User does not want mail of this type. root@lancelot-laptop:/home/lancelot# tracejob 77 Job: 77.lancelot-laptop 07/24/2012 15:13:11 S enqueuing into sai, state 1 hop 1 07/24/2012 15:13:11 S Job Queued at request of lancelot@cfa, owner = lancelot@cfa, job name = job4, queue = sai 07/24/2012 15:13:11 S Job Modified at request of Scheduler@lancelot-laptop 07/24/2012 15:13:11 L Job Run 07/24/2012 15:13:11 S Job Run at request of Scheduler@lancelot-laptop 07/24/2012 15:13:11 S Not sending email: User does not want mail of this type. 07/24/2012 15:13:11 A queue=sai 07/24/2012 15:13:11 A user=lancelot group=lancelot jobname=job4 queue=sai ctime=1343113991 qtime=1343113991 etime=1343113991 start=1343113991 owner=lancelot@cfa exec_host=lancelot-laptop/1 Resource_List.cput=01:00:00 Resource_List.walltime=24:00:00 07/24/2012 15:13:56 S Not sending email: User does not want mail of this type. 07/24/2012 15:13:56 S Exit_status=0 resources_used.cput=00:00:45 resources_used.mem=5300kb resources_used.vmem=19680kb resources_used.walltime=00:00:45 07/24/2012 15:13:56 M scan_for_terminated: job 77.lancelot-laptop task 1 terminated, sid=4008 07/24/2012 15:13:56 M job was terminated 07/24/2012 15:13:56 M obit sent to server 07/24/2012 15:13:56 A user=lancelot group=lancelot jobname=job4 queue=sai ctime=1343113991 qtime=1343113991 etime=1343113991 start=1343113991 owner=lancelot@cfa exec_host=lancelot-laptop/1 Resource_List.cput=01:00:00 Resource_List.walltime=24:00:00 session=4008 end=1343114036 Exit_status=0 resources_used.cput=00:00:45 resources_used.mem=5300kb resources_used.vmem=19680kb resources_used.walltime=00:00:45 07/24/2012 15:13:57 M removed job script 07/24/2012 15:18:57 S dequeuing from sai, state COMPLETE root@lancelot-laptop:/home/lancelot# tracejob 78 /var/spool/torque/mom_logs/20120724: No matching job records located Job: 78.lancelot-laptop 07/24/2012 16:25:51 S enqueuing into sai, state 1 hop 1 07/24/2012 16:25:51 S Job Queued at request of lancelot@cfa, owner = lancelot@cfa, job name = job3, queue = sai 07/24/2012 16:25:51 A queue=sai 07/24/2012 16:25:56 S Job Modified at request of Scheduler@lancelot-laptop 07/24/2012 16:25:56 L Not enough of the right type of nodes available root@lancelot-laptop:/home/lancelot# qstat -f 76 Job Id: 76.lancelot-laptop Job_Name = job3 Job_Owner = lancelot@cfa resources_used.cput = 00:00:00 resources_used.mem = 9304kb resources_used.vmem = 478176kb resources_used.walltime = 01:15:51 job_state = R queue = sai server = lancelot-laptop Checkpoint = u ctime = Tue Jul 24 15:01:03 2012 Error_Path = cfa:/home/lancelot/job3.err exec_host = cfa/0+lancelot-laptop/0 exec_port = 15003+15003 Hold_Types = n Join_Path = n Keep_Files = n Mail_Points = a mtime = Tue Jul 24 15:01:57 2012 Output_Path = cfa:/home/lancelot/job3.log Priority = 0 qtime = Tue Jul 24 15:01:03 2012 Rerunable = True Resource_List.cput = 01:00:00 Resource_List.neednodes = 2 Resource_List.nodect = 2 Resource_List.nodes = 2 Resource_List.walltime = 24:00:00 session_id = 752 substate = 42 Variable_List = PBS_O_QUEUE=sai,PBS_O_HOST=cfa,PBS_O_HOME=/home/lancelot, PBS_O_LANG=en_US.UTF-8,PBS_O_LOGNAME=lancelot, PBS_O_PATH=/usr/lib64/qt-3.3/bin:/usr/local/sbin:/usr/local/bin:/usr/ sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/torque/bin:/usr/local/t orque/sbin:/usr/local/maui/bin:/usr/local/maui/sbin:/usr/java/jdk1.6.0 _33/bin:/home/shu/software/mpich2-1.4:/root/bin:/usr/local/torque/bin: /usr/local/maui/bin:/usr/java/jdk1.6.0_33/bin:/home/shu/software/mpich 2-1.4,PBS_O_MAIL=/var/spool/mail/lancelot,PBS_O_SHELL=/bin/bash, PBS_SERVER=lancelot-laptop,PBS_O_WORKDIR=/home/lancelot, TOMCAT_HOME=/home/shu/software/apache-tomcat-7.0.29,HOSTNAME=cfa, SHELL=/bin/bash,TERM=xterm,HISTSIZE=1000, SSH_CLIENT=192.168.0.46 58198 22,QTDIR=/usr/lib64/qt-3.3, QTINC=/usr/lib64/qt-3.3/include,SSH_TTY=/dev/pts/6,USER=lancelot, LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd= 40;33;01:cd=40;33;01:or=40;31;01:mi=01;05;37;41:su=37;41:sg=30;43:ca=3 0;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arj =01;31:*.taz=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*. zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lz=01;31:*.xz=01 ;31:*.bz2=01;31:*.tbz=01;31:*.tbz2=01;31:*.bz=01;31:*.tz=01;31:*.deb=0 1;31:*.rpm=01;31:*.jar=01;31:*.rar=01;31:*.ace=01;31:*.zoo=01;31:*.cpi o=01;31:*.7z=01;31:*.rz=01;31:*.jpg=01;35:*.jpeg=01;35:*.gif=01;35:*.b mp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:* .xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01; 35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v= 01;35:*.mkv=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.v ob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.r mvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:* .dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35: *.axv=01;35:*.anx=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=01;36:*.au=01;36 :*.flac=01;36:*.mid=01;36:*.midi=01;36:*.mka=01;36:*.mp3=01;36:*.mpc=0 1;36:*.ogg=01;36:*.ra=01;36:*.wav=01;36:*.axa=01;36:*.oga=01;36:*.spx= 01;36:*.xspf=01;36:, PATH=/usr/lib64/qt-3.3/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/ usr/bin:/sbin:/bin:/usr/games:/usr/local/torque/bin:/usr/local/torque/ sbin:/usr/local/maui/bin:/usr/local/maui/sbin:/usr/java/jdk1.6.0_33/bi n:/home/shu/software/mpich2-1.4:/root/bin:/usr/local/torque/bin:/usr/l ocal/maui/bin:/usr/java/jdk1.6.0_33/bin:/home/shu/software/mpich2-1.4, MAIL=/var/spool/mail/lancelot,PWD=/home/lancelot, JAVA_HOME=/usr/java/jdk1.6.0_33,LANG=en_US.UTF-8, HISTCONTROL=ignoredups, SSH_ASKPASS=/usr/libexec/openssh/gnome-ssh-askpass, HOME=/home/lancelot,SHLVL=6,LOGNAME=lancelot,CVS_RSH=ssh, QTLIB=/usr/lib64/qt-3.3/lib, SSH_CONNECTION=192.168.0.46 58198 192.168.0.111 22, CLASSPATH=.:/usr/java/jdk1.6.0_33/jre/lib/rt.jar:/usr/java/jdk1.6.0_3 3/lib/dt.jar:/usr/java/jdk1.6.0_33/lib/tools.jar, LESSOPEN=|/usr/bin/lesspipe.sh %s,TORQUE=/usr/local/torque, MAUI=/usr/local/maui,G_BROKEN_FILENAMES=1,_=/usr/local/bin/qsub euser = lancelot egroup = lancelot hashname = 76.lancelot-laptop queue_rank = 19 queue_type = E comment = Job started on Tue Jul 24 at 15:01 etime = Tue Jul 24 15:01:03 2012 submit_args = job3.pbs start_time = Tue Jul 24 15:01:03 2012 Walltime.Remaining = 76692 start_count = 1 fault_tolerant = False submit_host = cfa init_work_dir = /home/lancelot root@lancelot-laptop:/home/lancelot# qstat Job id Name User Time Use S Queue ------------------------- ---------------- --------------- -------- - ----- 76.lancelot-laptop job3 lancelot 00:00:00 R sai 78.lancelot-laptop job3 lancelot 0 Q sai root@lancelot-laptop:/home/lancelot# qsub --version version: 3.0.3 root@lancelot-laptop:/home/lancelot# qmgr -c 'p s' # # Create queues and set their attributes. # # # Create and define queue sai # create queue sai set queue sai queue_type = Execution set queue sai acl_groups = lancelot-laptop set queue sai acl_group_sloppy = True set queue sai route_destinations = lancelot-laptop set queue sai enabled = True set queue sai started = True # # Set server attributes. # set server scheduling = True set server acl_hosts = lancelot-laptop set server managers = lancelot@lancelot-laptop set server operators = lancelot@lancelot-laptop set server default_queue = sai set server log_events = 511 set server mail_from = adm set server scheduler_iteration = 600 set server node_check_rate = 150 set server tcp_timeout = 6 set server mom_job_sync = True set server keep_completed = 300 set server next_job_number = 79 root@lancelot-laptop:/home/lancelot# qmgr -c "list queue sai" Queue sai queue_type = Execution total_jobs = 2 state_count = Transit:0 Queued:1 Held:0 Waiting:0 Running:1 Exiting:0 acl_groups = lancelot-laptop acl_group_sloppy = True mtime = Tue Jul 24 11:31:37 2012 resources_assigned.nodect = 2 route_destinations = lancelot-laptop enabled = True started = True root@lancelot-laptop:/home/lancelot# cat /var/spool/torque/server_priv/nodes lancelot-laptop np=2 cfa np=12 tom np=2 root@lancelot-laptop:/home/lancelot# cat /var/spool/torque/mom_priv/config $pbsserver lancelot-laptop $logevent 255 cat server_name lancelot-laptop root@lancelot-laptop:/home/lancelot# qstat -Q Queue Max Tot Ena Str Que Run Hld Wat Trn Ext T ---------------- --- --- --- --- --- --- --- --- --- --- - sai 0 2 yes yes 1 1 0 0 0 0 E root@lancelot-laptop:/home/lancelot# qstat -q server: lancelot-laptop Queue Memory CPU Time Walltime Node Run Que Lm State ---------------- ------ -------- -------- ---- --- --- -- ----- sai -- -- -- -- 1 1 -- E R ----- ----- 1 1 root@lancelot-laptop:/home/lancelot# qstat -B Server Max Tot Que Run Hld Wat Trn Ext Status ---------------- --- --- --- --- --- --- --- --- ---------- lancelot-laptop 0 2 1 1 0 0 0 0 Active mom_logs: 07/24/2012 14:53:43;0002; pbs_mom;Svr;im_eof;End of File from addr 192.168.0.111:1023 07/24/2012 14:53:50;0002; pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0 07/24/2012 14:54:44;0001; pbs_mom;Svr;pbs_mom;LOG_DEBUG::mom_checkpoint_job_has_checkpoint, FALSE 07/24/2012 14:54:44;0001; pbs_mom;Job;TMomFinalizeJob3;job 75.lancelot-laptop started, pid = 3807 07/24/2012 14:55:29;0080; pbs_mom;Job;75.lancelot-laptop;scan_for_terminated: job 75.lancelot-laptop task 1 terminated, sid=3807 07/24/2012 14:55:29;0008; pbs_mom;Job;75.lancelot-laptop;job was terminated 07/24/2012 14:55:29;0080; pbs_mom;Svr;preobit_reply;top of preobit_reply 07/24/2012 14:55:29;0080; pbs_mom;Svr;preobit_reply;DIS_reply_read/decode_DIS_replySvr worked, top of while loop 07/24/2012 14:55:29;0080; pbs_mom;Svr;preobit_reply;in while loop, no error from job stat 07/24/2012 14:55:29;0080; pbs_mom;Job;75.lancelot-laptop;obit sent to server 07/24/2012 14:55:29;0080; pbs_mom;Job;75.lancelot-laptop;removed job script 07/24/2012 14:58:50;0002; pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0 07/24/2012 15:01:03;0008; pbs_mom;Job;76.lancelot-laptop;JOIN JOB as node 1 07/24/2012 15:03:50;0002; pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0 07/24/2012 15:08:50;0002; pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0 07/24/2012 15:13:11;0001; pbs_mom;Svr;pbs_mom;LOG_DEBUG::mom_checkpoint_job_has_checkpoint, FALSE 07/24/2012 15:13:11;0001; pbs_mom;Job;TMomFinalizeJob3;job 77.lancelot-laptop started, pid = 4008 07/24/2012 15:13:50;0002; pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0 07/24/2012 15:13:56;0080; pbs_mom;Job;77.lancelot-laptop;scan_for_terminated: job 77.lancelot-laptop task 1 terminated, sid=4008 07/24/2012 15:13:56;0008; pbs_mom;Job;77.lancelot-laptop;job was terminated 07/24/2012 15:13:56;0080; pbs_mom;Svr;preobit_reply;top of preobit_reply 07/24/2012 15:13:56;0080; pbs_mom;Svr;preobit_reply;DIS_reply_read/decode_DIS_replySvr worked, top of while loop 07/24/2012 15:13:56;0080; pbs_mom;Svr;preobit_reply;in while loop, no error from job stat 07/24/2012 15:13:56;0080; pbs_mom;Job;77.lancelot-laptop;obit sent to server 07/24/2012 15:13:57;0080; pbs_mom;Job;77.lancelot-laptop;removed job script 07/24/2012 15:18:50;0002; pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0 07/24/2012 15:23:50;0002; pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0 07/24/2012 15:28:50;0002; pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0 07/24/2012 15:33:50;0002; pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0