<table cellspacing="0" cellpadding="0" border="0" ><tr><td valign="top" style="font: inherit;"><pre>I have found the problem, thanks! <br>I had failed to install MPI into the same path! <br><br>My configuration and the error messages follow.<br><br><br>EMAILS:
PBS JOB ID:76.lancelot-laptop

[lancelot@cfa ~]$ cat job3.pbs
#!/bin/bash
#PBS -N job3
#PBS -o job3.log
#PBS -e job3.err
#PBS -q sai
#PBA -I
#PBS -l nodes=2:ppn=2
#PBS -l walltime=24:00:00
#PBS -l cput=1:00:00
#PBS -V
cd /home/lancelot
echo running on hosts `hostname`
echo time is `date`
echo directory is $PWD
echo job runs on the nodes:
cat $PBS_NODEFILE
NPROCS=`wc -l &lt; $PBS_NODEFILE`
echo this job has allocated $NPROCS nodes
mpiexec -np 4 ./prog

[lancelot@cfa ~]$ cat prog
#!/bin/bash
echo 999999999|./icpi


root@lancelot-laptop:/home/lancelot# pbsnodes
lancelot-laptop
     state = free
     np = 2
     ntype = cluster
     jobs = 0/76.lancelot-laptop
     status = rectime=1343122703,varattr=,jobs=76.lancelot-laptop,state=free,netload=95261305,gres=,loadave=0.57,ncpus=2,physmem=1542608kb,availmem=2981784kb,totmem=3494344kb,idletime=14158,nusers=2,nsessions=13,sessions=1100 792 1309 1349 1365 1374 1384 1439 1452 1682 1749 1798 2737,uname=Linux lancelot-laptop 2.6.32-41-generic #94-Ubuntu SMP Fri Jul 6 16:51:39 UTC 2012 i686,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0

cfa
     state = free
     np = 12
     ntype = cluster
     jobs = 0/76.lancelot-laptop
     status = rectime=1343122703,varattr=,jobs=76.lancelot-laptop,state=free,netload=492745850,gres=,loadave=0.00,ncpus=12,physmem=8015456kb,availmem=22517440kb,totmem=24399448kb,idletime=2992,nusers=5,nsessions=58,sessions=18335 469 27670 752 18344 834 1171 1982 2226 3403 2290 14058 14160 14359 14579 15144 15464 15698 15913 16121 16201 16444 16988 17058 17603 18048 18278 18378 18379 18405 18411 18479 18557 18884 19096 22028 22149 22256 22257 22283 22290 22347 27347 27515 27561 30703 30712 30795 30797 30823 30829 30905 32454 32458 32459 32467 32469 32489,uname=Linux cfa 2.6.32-220.el6.x86_64 #1 SMP Tue Dec 6 19:48:22 GMT 2011 x86_64,opsys=linux
     mom_service_port = 15002
     mom_manager_port = 15003
     gpus = 0

root@lancelot-laptop:/home/lancelot# tracejob 76

Job: 76.lancelot-laptop

07/24/2012 15:01:03  M    JOIN JOB as node 1
07/24/2012 15:01:03  S    enqueuing into sai, state 1 hop 1
07/24/2012 15:01:03  S    Job Queued at request of lancelot@cfa, owner =
                          lancelot@cfa, job name = job3, queue = sai
07/24/2012 15:01:03  S    Job Modified at request of Scheduler@lancelot-laptop
07/24/2012 15:01:03  L    Job Run
07/24/2012 15:01:03  S    Job Run at request of Scheduler@lancelot-laptop
07/24/2012 15:01:03  A    queue=sai
07/24/2012 15:01:03  A    user=lancelot group=lancelot jobname=job3 queue=sai
                          ctime=1343113263 qtime=1343113263 etime=1343113263
                          start=1343113263 owner=lancelot@cfa
                          exec_host=cfa/0+lancelot-laptop/0
                          Resource_List.cput=01:00:00 Resource_List.neednodes=2
                          Resource_List.nodect=2 Resource_List.nodes=2
                          Resource_List.walltime=24:00:00 
07/24/2012 15:01:57  S    Not sending email: User does not want mail of this
                          type.

root@lancelot-laptop:/home/lancelot# tracejob 77

Job: 77.lancelot-laptop

07/24/2012 15:13:11  S    enqueuing into sai, state 1 hop 1
07/24/2012 15:13:11  S    Job Queued at request of lancelot@cfa, owner =
                          lancelot@cfa, job name = job4, queue = sai
07/24/2012 15:13:11  S    Job Modified at request of Scheduler@lancelot-laptop
07/24/2012 15:13:11  L    Job Run
07/24/2012 15:13:11  S    Job Run at request of Scheduler@lancelot-laptop
07/24/2012 15:13:11  S    Not sending email: User does not want mail of this
                          type.
07/24/2012 15:13:11  A    queue=sai
07/24/2012 15:13:11  A    user=lancelot group=lancelot jobname=job4 queue=sai
                          ctime=1343113991 qtime=1343113991 etime=1343113991
                          start=1343113991 owner=lancelot@cfa
                          exec_host=lancelot-laptop/1
                          Resource_List.cput=01:00:00
                          Resource_List.walltime=24:00:00 
07/24/2012 15:13:56  S    Not sending email: User does not want mail of this
                          type.
07/24/2012 15:13:56  S    Exit_status=0 resources_used.cput=00:00:45
                          resources_used.mem=5300kb resources_used.vmem=19680kb
                          resources_used.walltime=00:00:45
07/24/2012 15:13:56  M    scan_for_terminated: job 77.lancelot-laptop task 1
                          terminated, sid=4008
07/24/2012 15:13:56  M    job was terminated
07/24/2012 15:13:56  M    obit sent to server
07/24/2012 15:13:56  A    user=lancelot group=lancelot jobname=job4 queue=sai
                          ctime=1343113991 qtime=1343113991 etime=1343113991
                          start=1343113991 owner=lancelot@cfa
                          exec_host=lancelot-laptop/1
                          Resource_List.cput=01:00:00
                          Resource_List.walltime=24:00:00 session=4008
                          end=1343114036 Exit_status=0
                          resources_used.cput=00:00:45 resources_used.mem=5300kb
                          resources_used.vmem=19680kb
                          resources_used.walltime=00:00:45
07/24/2012 15:13:57  M    removed job script
07/24/2012 15:18:57  S    dequeuing from sai, state COMPLETE


root@lancelot-laptop:/home/lancelot# tracejob 78
/var/spool/torque/mom_logs/20120724: No matching job records located

Job: 78.lancelot-laptop

07/24/2012 16:25:51  S    enqueuing into sai, state 1 hop 1
07/24/2012 16:25:51  S    Job Queued at request of lancelot@cfa, owner =
                          lancelot@cfa, job name = job3, queue = sai
07/24/2012 16:25:51  A    queue=sai
07/24/2012 16:25:56  S    Job Modified at request of Scheduler@lancelot-laptop
07/24/2012 16:25:56  L    Not enough of the right type of nodes available


root@lancelot-laptop:/home/lancelot# qstat -f 76
Job Id: 76.lancelot-laptop
    Job_Name = job3
    Job_Owner = lancelot@cfa
    resources_used.cput = 00:00:00
    resources_used.mem = 9304kb
    resources_used.vmem = 478176kb
    resources_used.walltime = 01:15:51
    job_state = R
    queue = sai
    server = lancelot-laptop
    Checkpoint = u
    ctime = Tue Jul 24 15:01:03 2012
    Error_Path = cfa:/home/lancelot/job3.err
    exec_host = cfa/0+lancelot-laptop/0
    exec_port = 15003+15003
    Hold_Types = n
    Join_Path = n
    Keep_Files = n
    Mail_Points = a
    mtime = Tue Jul 24 15:01:57 2012
    Output_Path = cfa:/home/lancelot/job3.log
    Priority = 0
    qtime = Tue Jul 24 15:01:03 2012
    Rerunable = True
    Resource_List.cput = 01:00:00
    Resource_List.neednodes = 2
    Resource_List.nodect = 2
    Resource_List.nodes = 2
    Resource_List.walltime = 24:00:00
    session_id = 752
    substate = 42
    Variable_List = PBS_O_QUEUE=sai,PBS_O_HOST=cfa,PBS_O_HOME=/home/lancelot,
        PBS_O_LANG=en_US.UTF-8,PBS_O_LOGNAME=lancelot,
        PBS_O_PATH=/usr/lib64/qt-3.3/bin:/usr/local/sbin:/usr/local/bin:/usr/
        sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/torque/bin:/usr/local/t
        orque/sbin:/usr/local/maui/bin:/usr/local/maui/sbin:/usr/java/jdk1.6.0
        _33/bin:/home/shu/software/mpich2-1.4:/root/bin:/usr/local/torque/bin:
        /usr/local/maui/bin:/usr/java/jdk1.6.0_33/bin:/home/shu/software/mpich
        2-1.4,PBS_O_MAIL=/var/spool/mail/lancelot,PBS_O_SHELL=/bin/bash,
        PBS_SERVER=lancelot-laptop,PBS_O_WORKDIR=/home/lancelot,
        TOMCAT_HOME=/home/shu/software/apache-tomcat-7.0.29,HOSTNAME=cfa,
        SHELL=/bin/bash,TERM=xterm,HISTSIZE=1000,
        SSH_CLIENT=192.168.0.46 58198 22,QTDIR=/usr/lib64/qt-3.3,
        QTINC=/usr/lib64/qt-3.3/include,SSH_TTY=/dev/pts/6,USER=lancelot,
        LS_COLORS=rs=0:di=01;34:ln=01;36:mh=00:pi=40;33:so=01;35:do=01;35:bd=
        40;33;01:cd=40;33;01:or=40;31;01:mi=01;05;37;41:su=37;41:sg=30;43:ca=3
        0;41:tw=30;42:ow=34;42:st=37;44:ex=01;32:*.tar=01;31:*.tgz=01;31:*.arj
        =01;31:*.taz=01;31:*.lzh=01;31:*.lzma=01;31:*.tlz=01;31:*.txz=01;31:*.
        zip=01;31:*.z=01;31:*.Z=01;31:*.dz=01;31:*.gz=01;31:*.lz=01;31:*.xz=01
        31:*.bz2=01;31:*.tbz=01;31:*.tbz2=01;31:*.bz=01;31:*.tz=01;31:*.deb=0
        1;31:*.rpm=01;31:*.jar=01;31:*.rar=01;31:*.ace=01;31:*.zoo=01;31:*.cpi
        o=01;31:*.7z=01;31:*.rz=01;31:*.jpg=01;35:*.jpeg=01;35:*.gif=01;35:*.b
        mp=01;35:*.pbm=01;35:*.pgm=01;35:*.ppm=01;35:*.tga=01;35:*.xbm=01;35:*
        .xpm=01;35:*.tif=01;35:*.tiff=01;35:*.png=01;35:*.svg=01;35:*.svgz=01;
        35:*.mng=01;35:*.pcx=01;35:*.mov=01;35:*.mpg=01;35:*.mpeg=01;35:*.m2v=
        01;35:*.mkv=01;35:*.ogm=01;35:*.mp4=01;35:*.m4v=01;35:*.mp4v=01;35:*.v
        ob=01;35:*.qt=01;35:*.nuv=01;35:*.wmv=01;35:*.asf=01;35:*.rm=01;35:*.r
        mvb=01;35:*.flc=01;35:*.avi=01;35:*.fli=01;35:*.flv=01;35:*.gl=01;35:*
        .dl=01;35:*.xcf=01;35:*.xwd=01;35:*.yuv=01;35:*.cgm=01;35:*.emf=01;35:
        *.axv=01;35:*.anx=01;35:*.ogv=01;35:*.ogx=01;35:*.aac=01;36:*.au=01;36
        :*.flac=01;36:*.mid=01;36:*.midi=01;36:*.mka=01;36:*.mp3=01;36:*.mpc=0
        1;36:*.ogg=01;36:*.ra=01;36:*.wav=01;36:*.axa=01;36:*.oga=01;36:*.spx=
        01;36:*.xspf=01;36:,
        PATH=/usr/lib64/qt-3.3/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/
        usr/bin:/sbin:/bin:/usr/games:/usr/local/torque/bin:/usr/local/torque/
        sbin:/usr/local/maui/bin:/usr/local/maui/sbin:/usr/java/jdk1.6.0_33/bi
        n:/home/shu/software/mpich2-1.4:/root/bin:/usr/local/torque/bin:/usr/l
        ocal/maui/bin:/usr/java/jdk1.6.0_33/bin:/home/shu/software/mpich2-1.4,
        MAIL=/var/spool/mail/lancelot,PWD=/home/lancelot,
        JAVA_HOME=/usr/java/jdk1.6.0_33,LANG=en_US.UTF-8,
        HISTCONTROL=ignoredups,
        SSH_ASKPASS=/usr/libexec/openssh/gnome-ssh-askpass,
        HOME=/home/lancelot,SHLVL=6,LOGNAME=lancelot,CVS_RSH=ssh,
        QTLIB=/usr/lib64/qt-3.3/lib,
        SSH_CONNECTION=192.168.0.46 58198 192.168.0.111 22,
        CLASSPATH=.:/usr/java/jdk1.6.0_33/jre/lib/rt.jar:/usr/java/jdk1.6.0_3
        3/lib/dt.jar:/usr/java/jdk1.6.0_33/lib/tools.jar,
        LESSOPEN=|/usr/bin/lesspipe.sh %s,TORQUE=/usr/local/torque,
        MAUI=/usr/local/maui,G_BROKEN_FILENAMES=1,_=/usr/local/bin/qsub
    euser = lancelot
    egroup = lancelot
    hashname = 76.lancelot-laptop
    queue_rank = 19
    queue_type = E
    comment = Job started on Tue Jul 24 at 15:01
    etime = Tue Jul 24 15:01:03 2012
    submit_args = job3.pbs
    start_time = Tue Jul 24 15:01:03 2012
    Walltime.Remaining = 76692
    start_count = 1
    fault_tolerant = False
    submit_host = cfa
    init_work_dir = /home/lancelot

root@lancelot-laptop:/home/lancelot# qstat
Job id                    Name             User            Time Use S Queue
------------------------- ---------------- --------------- -------- - -----
76.lancelot-laptop         job3             lancelot        00:00:00 R sai            
78.lancelot-laptop         job3             lancelot               0 Q sai            

root@lancelot-laptop:/home/lancelot# qsub --version
version: 3.0.3



root@lancelot-laptop:/home/lancelot# qmgr -c 'p s'
#
# Create queues and set their attributes.
#
#
# Create and define queue sai
#
create queue sai
set queue sai queue_type = Execution
set queue sai acl_groups = lancelot-laptop
set queue sai acl_group_sloppy = True
set queue sai route_destinations = lancelot-laptop
set queue sai enabled = True
set queue sai started = True
#
# Set server attributes.
#
set server scheduling = True
set server acl_hosts = lancelot-laptop
set server managers = lancelot@lancelot-laptop
set server operators = lancelot@lancelot-laptop
set server default_queue = sai
set server log_events = 511
set server mail_from = adm
set server scheduler_iteration = 600
set server node_check_rate = 150
set server tcp_timeout = 6
set server mom_job_sync = True
set server keep_completed = 300
set server next_job_number = 79


root@lancelot-laptop:/home/lancelot# qmgr -c "list queue sai"
Queue sai
        queue_type = Execution
        total_jobs = 2
        state_count = Transit:0 Queued:1 Held:0 Waiting:0 Running:1 Exiting:0 
        acl_groups = lancelot-laptop
        acl_group_sloppy = True
        mtime = Tue Jul 24 11:31:37 2012
        resources_assigned.nodect = 2
        route_destinations = lancelot-laptop
        enabled = True
        started = True


root@lancelot-laptop:/home/lancelot# cat /var/spool/torque/server_priv/nodes
lancelot-laptop np=2
cfa np=12
tom np=2

root@lancelot-laptop:/home/lancelot# cat /var/spool/torque/mom_priv/config
$pbsserver   lancelot-laptop
$logevent   255


cat server_name
lancelot-laptop


root@lancelot-laptop:/home/lancelot# qstat -Q
Queue              Max   Tot   Ena   Str   Que   Run   Hld   Wat   Trn   Ext T         
----------------   ---   ---   ---   ---   ---   ---   ---   ---   ---   --- -         
sai                  0     2   yes   yes     1     1     0     0     0     0 E         
root@lancelot-laptop:/home/lancelot# qstat -q

server: lancelot-laptop

Queue            Memory CPU Time Walltime Node  Run Que Lm  State
---------------- ------ -------- -------- ----  --- --- --  -----
sai                --      --       --      --    1   1 --   E R
                                               ----- -----
                                                   1     1
root@lancelot-laptop:/home/lancelot# qstat -B
Server             Max   Tot   Que   Run   Hld   Wat   Trn   Ext Status    
----------------   ---   ---   ---   ---   ---   ---   ---   --- ----------
lancelot-laptop      0     2     1     1     0     0     0     0 Active    


mom_logs:

07/24/2012 14:53:43;0002;   pbs_mom;Svr;im_eof;End of File from addr 192.168.0.111:1023
07/24/2012 14:53:50;0002;   pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0
07/24/2012 14:54:44;0001;   pbs_mom;Svr;pbs_mom;LOG_DEBUG::mom_checkpoint_job_has_checkpoint, FALSE
07/24/2012 14:54:44;0001;   pbs_mom;Job;TMomFinalizeJob3;job 75.lancelot-laptop started, pid = 3807
07/24/2012 14:55:29;0080;   pbs_mom;Job;75.lancelot-laptop;scan_for_terminated: job 75.lancelot-laptop task 1 terminated, sid=3807
07/24/2012 14:55:29;0008;   pbs_mom;Job;75.lancelot-laptop;job was terminated
07/24/2012 14:55:29;0080;   pbs_mom;Svr;preobit_reply;top of preobit_reply
07/24/2012 14:55:29;0080;   pbs_mom;Svr;preobit_reply;DIS_reply_read/decode_DIS_replySvr worked, top of while loop
07/24/2012 14:55:29;0080;   pbs_mom;Svr;preobit_reply;in while loop, no error from job stat
07/24/2012 14:55:29;0080;   pbs_mom;Job;75.lancelot-laptop;obit sent to server
07/24/2012 14:55:29;0080;   pbs_mom;Job;75.lancelot-laptop;removed job script
07/24/2012 14:58:50;0002;   pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0
07/24/2012 15:01:03;0008;   pbs_mom;Job;76.lancelot-laptop;JOIN JOB as node 1
07/24/2012 15:03:50;0002;   pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0
07/24/2012 15:08:50;0002;   pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0
07/24/2012 15:13:11;0001;   pbs_mom;Svr;pbs_mom;LOG_DEBUG::mom_checkpoint_job_has_checkpoint, FALSE
07/24/2012 15:13:11;0001;   pbs_mom;Job;TMomFinalizeJob3;job 77.lancelot-laptop started, pid = 4008
07/24/2012 15:13:50;0002;   pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0
07/24/2012 15:13:56;0080;   pbs_mom;Job;77.lancelot-laptop;scan_for_terminated: job 77.lancelot-laptop task 1 terminated, sid=4008
07/24/2012 15:13:56;0008;   pbs_mom;Job;77.lancelot-laptop;job was terminated
07/24/2012 15:13:56;0080;   pbs_mom;Svr;preobit_reply;top of preobit_reply
07/24/2012 15:13:56;0080;   pbs_mom;Svr;preobit_reply;DIS_reply_read/decode_DIS_replySvr worked, top of while loop
07/24/2012 15:13:56;0080;   pbs_mom;Svr;preobit_reply;in while loop, no error from job stat
07/24/2012 15:13:56;0080;   pbs_mom;Job;77.lancelot-laptop;obit sent to server
07/24/2012 15:13:57;0080;   pbs_mom;Job;77.lancelot-laptop;removed job script
07/24/2012 15:18:50;0002;   pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0
07/24/2012 15:23:50;0002;   pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0
07/24/2012 15:28:50;0002;   pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0
07/24/2012 15:33:50;0002;   pbs_mom;Svr;pbs_mom;Torque Mom Version = 3.0.3, loglevel = 0


</pre></td></tr></table>