117 lines
3.2 KiB
Bash
117 lines
3.2 KiB
Bash
#!/bin/bash
#
# gsub_wait: submit a job through gsub, then follow its progress on stdout.
#
# Usage: gsub_wait <jobname-without-extension>
#
# Environment:
#   GSUB_SILENT   when set to 1, only submit; skip the monitoring phase.
#
# Files (relative to the current directory):
#   <job>.gin            input file; its --Link1-- separators give the step count
#   <job>.out            output log, polled for "SCF Done" / termination lines
#   <job>.job.o<jobid>   PBS end file; its appearance marks the job as finished
#
# Exit status: 1 on missing argument, gsub's own status if submission fails,
# 0 otherwise (including every "monitor skipped" path).

set -u

job=${1:-}
if [[ -z "$job" ]]; then
  echo "Usage: $0 <jobname-without-extension>" >&2
  exit 1
fi

# ==========================================
# 1. Submit Job
# ==========================================

# Prefer an executable ./gsub in the current directory; otherwise use PATH.
if [[ -x "./gsub" ]]; then
  GSUB_CMD="./gsub"
else
  GSUB_CMD="gsub"
fi

# Capture gsub's output so the job ID can be parsed from it below.
# Note: gsub may run remotely over SSH and ultimately relays qsub's output.
output=$("$GSUB_CMD" "$job")
gsub_status=$?
printf '%s\n' "$output"

# A failed submission leaves nothing to monitor; propagate the failure
# instead of trying to parse a job ID out of an error message.
if (( gsub_status != 0 )); then
  echo "Error: $GSUB_CMD exited with status $gsub_status." >&2
  exit "$gsub_status"
fi

# ==========================================
# 2. Check Silent Mode
# ==========================================

# With GSUB_SILENT=1 the script only submits and returns immediately.
if [[ "${GSUB_SILENT:-0}" == "1" ]]; then
  exit 0
fi

# ==========================================
# 3. Monitor Progress
# ==========================================

# The job ID is the first run of digits in the submission output
# (e.g. "67147.cluster" -> 67147).
if [[ ! "$output" =~ [0-9]+ ]]; then
  echo "Could not parse Job ID from output. Monitor skipped."
  exit 0
fi
jobid=${BASH_REMATCH[0]}

out_file="$job.out"
gin_file="$job.gin"
end_file="$job.job.o$jobid"

if [[ ! -f "$gin_file" ]]; then
  # The input may live elsewhere (e.g. on the remote side); nothing to count.
  echo "Warning: $gin_file not found nearby. Skipping monitor."
  exit 0
fi

# Total steps = number of --Link1-- separator lines + 1.
# Matched case-insensitively so "--Link1--" and "--link1--" both count.
# grep -c exits 1 when the count is 0, hence the "|| true".
link_count=$(grep -ci -- "--link1--" "$gin_file" || true)
total=$(( ${link_count:-0} + 1 ))

cntDone=0     # "termination of Gaussian" lines seen so far
cntSCF=0      # "SCF Done" lines seen so far
last_lines=0  # complete lines of $out_file already processed

# Captures the energy value from an "SCF Done: ... E... = <value> A.U." line.
scf_re='SCF[[:space:]]Done:.*E.*=[[:space:]]*([-0-9.]+)[[:space:]]*A\.U\.'

echo "Monitoring Job $jobid..."

while true; do
  # A. Note whether the PBS end file exists BEFORE reading the log, but do
  #    not stop yet: lines written just before the job ended must still be
  #    read, otherwise the final termination line is never counted.
  finished=0
  if [[ -f "$end_file" ]]; then
    finished=1
  fi

  # B. Process whatever was appended to the output log since the last poll.
  if [[ -f "$out_file" ]]; then
    # Braces make 2>/dev/null also cover a failing "<" redirection
    # (the file can disappear between the -f test and this read).
    curr_lines=$({ wc -l < "$out_file"; } 2>/dev/null || echo 0)

    # A shrinking file means it was truncated or regenerated: start over.
    if (( curr_lines < last_lines )); then
      last_lines=0
    fi

    if (( curr_lines > last_lines )); then
      # Process substitution (not a pipe) keeps the loop in this shell so
      # the counters survive. read fails on a trailing line without a
      # newline, so a half-written line is left for the next poll.
      while IFS= read -r line; do
        # Advance by lines actually consumed rather than by the earlier
        # wc count: the file may grow between wc and tail, and those extra
        # lines must not be processed again on the next poll.
        last_lines=$((last_lines + 1))

        if [[ "$line" =~ $scf_re ]]; then
          energy="${BASH_REMATCH[1]}"
          cntSCF=$((cntSCF + 1))
          echo "$job: SCF Done: $energy [$cntSCF] ($cntDone/$total)"
        fi

        if [[ "$line" == *"termination of Gaussian"* ]]; then
          cntDone=$((cntDone + 1))
          echo "$job: task done ($cntDone/$total)"
        fi
      done < <(tail -n "+$((last_lines + 1))" "$out_file")
    fi
  fi

  if (( finished )); then
    echo "Job finished (found $end_file)."
    break
  fi

  sleep 2
done

# C. Final sanity check: every step should have reported a termination line.
if (( cntDone != total )); then
  echo "Warning: cntDone ($cntDone) != total ($total)"
fi