tmp
-
#!/bin/bash
#
# Follow a log file and print, once each, the workflow ID (UUID) found after
# "WorkflowActor-" on every line that contains "Restarting".
# Runs until interrupted with Ctrl+C.

# 1. Log file path (defaults to nohup.out in the current directory; adjust to
#    the real location if needed).
LOG_FILE="./nohup.out"

# 2. Check that the log file exists before trying to follow it.
if [ ! -f "$LOG_FILE" ]; then
  echo "错误:日志文件 $LOG_FILE 不存在!请检查路径是否正确。" >&2
  exit 1
fi

# 3. Follow the log in real time and extract the target workflow IDs (UUIDs).
echo "=== 开始监听日志 $LOG_FILE,提取含 Restarting 的任务ID ==="
echo "=== 按 Ctrl+C 停止监听 ==="
echo "=========================="

# Core pipeline:
#   - tail -f            : follow content appended to the log.
#   - grep "Restarting"  : keep only lines that contain "Restarting".
#   - sed -E             : extract the 36-character UUID (8-4-4-4-12 hex digits)
#                          that follows "WorkflowActor-"; -u keeps output
#                          unbuffered so matches appear immediately.
#   - awk '!seen[$0]++'  : de-duplicate while streaming, so a workflow that
#                          restarts several times is printed only once.
#                          (sort -u cannot be used here: it only prints after
#                          its input ends, which never happens with tail -f.)
tail -f -- "$LOG_FILE" |
  grep --line-buffered "Restarting" |
  sed -u -n -E 's/.*WorkflowActor-([0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}).*/\1/p' |
  awk '!seen[$0]++ { print; fflush() }'
estarting micro_dy_gro.upstream
[INFO] [09/23/2025 06:46:37.396] [cromwell-system-akka.dispatchers.engine-dispatcher-9] [akka://cromwell-system/user/cromwell-service/WorkflowManagerActor/WorkflowActor-fa3a52b6-19db-4435-ac0f-a5c1fbeec385/WorkflowExecutionActor-fa3a52b6-19db-4435-ac0f-a5c1fbeec385] WorkflowExecutionActor-fa3a52b6-19db-4435-ac0f-a5c1fbeec385 [UUID(fa3a52b6)]: Restarting blood_meta.jsonFile, blood_meta.reportNoFile, blood_meta.resFile
################# retry : Some(9999) ################################## retry : Some(9999) ################################## retry : Some(9999) ################################## retry : Some(9999) ################################## retry : Some(9999) ################################## retry : Some(9999) ################################## retry : Some(9999) ################################## retry : Some(9998) ################################## retry : Some(9998) ################################## retry : Some(9999) ################################## retry : Some(9999) ################################## retry : Some(9999) ############### -
Type Reason Age From Message
Normal NodeReady 47m (x11 over 6h56m) kubelet Node node1 status is now: NodeReady
Normal NodeNotReady 44m (x12 over 7h3m) kubelet Node node1 status is now: NodeNotReady
Normal Starting 37m kubelet Starting kubelet.
Normal NodeHasSufficientMemory 37m kubelet Node node1 status is now: NodeHasSufficientMemory
Normal NodeHasNoDiskPressure 37m kubelet Node node1 status is now: NodeHasNoDiskPressure
Normal NodeHasSufficientPID 37m kubelet Node node1 status is now: NodeHasSufficientPID
Normal NodeAllocatableEnforced 37m kubelet Updated Node Allocatable limit across pods
Normal NodeReady 37m kubelet Node node1 status is now: NodeReady
Normal NodeNotReady 34m kubelet Node node1 status is now: NodeNotReady
Normal Starting 31m kubelet Starting kubelet.
Normal NodeHasSufficientMemory 31m kubelet Node node1 status is now: NodeHasSufficientMemory
Normal NodeHasNoDiskPressure 31m kubelet Node node1 status is now: NodeHasNoDiskPressure
Normal NodeHasSufficientPID 31m kubelet Node node1 status is now: NodeHasSufficientPID
Normal NodeAllocatableEnforced 31m kubelet Updated Node Allocatable limit across pods
Normal NodeReady 9m29s (x2 over 31m) kubelet Node node1 status is now: NodeReady
Normal NodeNotReady 6m28s (x2 over 28m) kubelet Node node1 status is now: NodeNotReady -
# Install the NVIDIA repository configuration and the container toolkit
# (intended for CentOS 8).
# The repo path is built from ID and VERSION_ID in /etc/os-release.
distribution=$(. /etc/os-release; echo "$ID$VERSION_ID")
curl -s -L "https://nvidia.github.io/nvidia-docker/$distribution/nvidia-docker.repo" | sudo tee /etc/yum.repos.d/nvidia-docker.repo
sudo dnf install -y nvidia-container-toolkit
-
# Configure the docker runtime through the nvidia-ctk CLI.
# NOTE(review): Docker presumably has to be restarted afterwards for the
# change to take effect — confirm against the toolkit documentation.
nvidia-ctk runtime configure --runtime=docker
-
# List "<namespace> <name>" for every pod in which at least one container sets
# a limit for the given resource. Replace <gpu-resource-name> with the actual
# resource key before running.
# any(...) makes each pod print once even when several of its containers match.
kubectl get pods --all-namespaces -o json | jq -r '.items[] | select(any(.spec.containers[]; .resources.limits["<gpu-resource-name>"] != null)) | .metadata.namespace + " " + .metadata.name'
-
2025/10/13 02:31:43 Starting FS watcher.
2025/10/13 02:31:43 Starting OS watcher.
2025/10/13 02:31:43 Starting Plugins.
2025/10/13 02:31:43 Loading configuration.
2025/10/13 02:31:43 Initializing NVML.
2025/10/13 02:31:43 Failed to initialize NVML: could not load NVML library.
2025/10/13 02:31:43 If this is a GPU node, did you set the docker default runtime to `nvidia`?
2025/10/13 02:31:43 You can check the prerequisites at: https://github.com/NVIDIA/k8s-device-plugin#prerequisites
2025/10/13 02:31:43 You can learn how to set the runtime at: https://github.com/NVIDIA/k8s-device-plugin#quick-start
2025/10/13 02:31:43 If this is not a GPU node, you should set up a toleration or nodeSelector to only deploy this plugin on GPU nodes -
-
2025/10/13 07:23:31 Failed to initialize NVML: could not load NVML library.
2025/10/13 07:23:31 If this is a GPU node, did you set the docker default runtime to `nvidia`? -
-
-
Failed to initialize NVML: could not load NVML library
-
-
-
$ curl -s -L https://nvidia.github.io/nvidia-docker/centos8/nvidia-docker.repo | sudo tee /etc/yum.repos.d/nvidia-docker.repo
yum install -y nvidia-container-toolkit -
d?????????? ? ? ? ? ? cephfs_data
-
root
Tzzs@2025*
220.185.228.106
30001 -
-
# Abort a single workflow: POST to the abort endpoint for the given workflow ID.
curl -X POST "http://192.168.30.202:31237/api/workflows/v1/8a6aa1f8-0ab1-4518-bc22-06390d1c7494/abort" -H "accept: application/json"
-
# Abort a batch of workflows: POST to the abort endpoint once per ID, in the
# order listed (an ID that appears twice is requested twice).
abort_api="http://192.168.30.202:31237/api/workflows/v1"
workflow_ids=(
  78cf44c7-8a10-45e7-b518-dda45a05c7ad
  989090fd-143a-47b8-b967-389a427d75a9
  42345c0e-542d-433d-971b-fd1398e5831f
  d21d8ffc-526e-4776-9632-401438be25c0
  c13bdcc8-76d3-48a9-acbc-feb1842f1b95
  222d2b48-0974-4cb7-8629-a0daf49d1177
  8a6aa1f8-0ab1-4518-bc22-06390d1c7494
  06dedc34-5518-453f-8dea-d922f793d329
  dbdcb701-f9a8-4e87-8b6f-17b96e612e4b
  2b427cce-9d1f-4522-8482-f4e55e7cad3f
  e18a9ee9-23cf-42e1-8383-a632908a5f23
  e01a39d8-6140-4fae-85c8-547a15d2f99f
  9d43ed37-0c46-49f6-8b5b-17a75a392ece
  975b5959-6cce-4c2e-8237-432dcf0451a7
  3f1fcf19-bf03-489a-a317-9f0452376af3
  3e58f4b4-6f2e-45d0-bfb2-585acfd0529f
  925fe273-25fb-43d4-93b3-56b752b40053
  cad4787c-baa4-4520-ba74-e244aceccd29
  b0b21bcc-61de-417d-8a49-29fffbd35f40
  fd4b9685-6495-4cd3-b1ff-7af3bf14bba0
  d5c5d280-692d-47d4-9db0-edbed55dc893
  fc85a28b-63e7-4f3e-ac19-282da4ce74b7
  164adb9e-9f72-43ab-a22d-90d5fa0469c5
  f76b822a-85f7-47c0-b285-a06ba61f619c
  0c7eee09-ab69-4d9f-9979-1f3d6ec0513c
  cbd712e0-1861-4c15-b5d5-1bd793b5c5c3
  dfaacb27-0b44-44c3-9664-108cdc11a076
  14606e5c-9399-4c8a-9908-3898a9caca73
  add7f4d1-2cb7-463a-bd4a-3dee1fb7dac6
  f8f8eb9c-077a-440d-922d-82ae0ff8109d
  920fdb9a-816e-4fcf-bed2-14b5ba333d7a
  a4caf18d-1795-4093-aeae-2468a325f771
  f44a01dc-ac87-4a51-9598-91873643a3c0
)
for workflow_id in "${workflow_ids[@]}"; do
  curl -X POST "${abort_api}/${workflow_id}/abort" -H "accept: application/json"
done