#!/bin/bash
#
# gsub_wait: submit a job through gsub, then follow its progress on stdout.
#
# Usage: gsub_wait JOB
#   JOB is the job base name. The script reads JOB.gin (input, used to count
#   the expected number of steps) and JOB.out (program output), and treats the
#   appearance of JOB.job.o<jobid> as the signal that the job has finished.
#
# Environment:
#   GSUB_SILENT=1   submit only; do not monitor.
#
# Exit status: 1 on missing argument, the status of gsub if submission fails,
# 0 otherwise (including the cases where monitoring is skipped).
set -u

job=${1:-}
if [[ -z "$job" ]]; then
  echo "Usage: $0 JOB"
  exit 1
fi

# ==========================================
# 1. Submit the job
# ==========================================

# Prefer a gsub executable in the current directory; otherwise rely on PATH.
if [[ -x "./gsub" ]]; then
  gsub_cmd="./gsub"
else
  gsub_cmd="gsub"
fi

# Capture what gsub prints so the job ID can be extracted from it below.
output=$("$gsub_cmd" "$job")
submit_status=$?
printf '%s\n' "$output"

# A failed submission has no job to watch. Stop here so that digits in an
# error message are not mistaken for a job ID (which would poll forever).
if (( submit_status != 0 )); then
  echo "Error: $gsub_cmd exited with status $submit_status." >&2
  exit "$submit_status"
fi

# ==========================================
# 2. Silent mode
# ==========================================
if [[ "${GSUB_SILENT:-0}" == "1" ]]; then
  exit 0
fi

# ==========================================
# 3. Monitor progress
# ==========================================

# The job ID is the first run of digits in the output
# (e.g. "67147.cluster" -> "67147").
if [[ ! "$output" =~ [0-9]+ ]]; then
  echo "Could not parse Job ID from output. Monitor skipped."
  exit 0
fi
jobid=${BASH_REMATCH[0]}

out_file="$job.out"
gin_file="$job.gin"
end_file="$job.job.o$jobid"

if [[ ! -f "$gin_file" ]]; then
  # Without the input file the expected step count is unknown.
  echo "Warning: $gin_file not found nearby. Skipping monitor."
  exit 0
fi

# Expected number of steps = number of "--Link1--" separators + 1.
# Matched case-insensitively so both "--Link1--" and "--link1--" count.
# grep -c exits non-zero when the count is 0, hence the "|| true".
link_count=$(grep -ci -- "--link1--" "$gin_file" || true)
total=$((link_count + 1))

cntDone=0    # steps that reported termination
cntSCF=0     # "SCF Done" lines seen
last_lines=0 # complete lines of $out_file already processed

# Kept in a variable so the regex is used unquoted on the right of =~.
# Captures the energy value from: SCF Done: ... E ... = <value> A.U.
scf_pattern='SCF[[:space:]]Done:.*E.*=[[:space:]]*([-0-9.]+)[[:space:]]*A\.U\.'

#######################################
# Process lines appended to $out_file since the previous call.
# Globals:   out_file, job, total, scf_pattern (read)
#            last_lines, cntSCF, cntDone (updated)
# Outputs:   one progress line on stdout per SCF result / finished step
#######################################
consume_new_output() {
  local line curr_lines energy

  [[ -f "$out_file" ]] || return 0

  # Braces make 2>/dev/null cover the shell's own "cannot open" error too.
  curr_lines=$({ wc -l < "$out_file"; } 2>/dev/null || echo 0)

  # File shrank (truncated or regenerated): start again from the top.
  if (( curr_lines < last_lines )); then
    last_lines=0
  fi
  (( curr_lines > last_lines )) || return 0

  # Process substitution keeps the loop in the current shell so the counters
  # survive. The offset is advanced per line actually read: the file may grow
  # between wc and tail, and those extra lines must not be re-read next time.
  # A trailing line without a newline makes read fail, so it is left for the
  # next call, when it is complete.
  while IFS= read -r line; do
    last_lines=$((last_lines + 1))

    if [[ "$line" =~ $scf_pattern ]]; then
      energy="${BASH_REMATCH[1]}"
      cntSCF=$((cntSCF + 1))
      echo "$job: SCF Done: $energy [$cntSCF] ($cntDone/$total)"
    fi

    if [[ "$line" == *"termination of Gaussian"* ]]; then
      cntDone=$((cntDone + 1))
      echo "$job: task done ($cntDone/$total)"
    fi
  done < <(tail -n "+$((last_lines + 1))" "$out_file")
}

echo "Monitoring Job $jobid..."

while true; do
  # Look for the end marker BEFORE draining the output: everything written
  # before the marker appeared is then guaranteed to be processed, so the
  # final termination line is not lost.
  finished=0
  if [[ -f "$end_file" ]]; then
    finished=1
  fi

  consume_new_output

  if (( finished )); then
    echo "Job finished (found $end_file)."
    break
  fi

  sleep 2
done

# Final sanity check: every expected step should have reported termination.
if (( cntDone != total )); then
  echo "Warning: cntDone ($cntDone) != total ($total)"
fi