Mom logs messages like this:
12/07/2007 00:04:10;0001; pbs_mom;Svr;pbs_mom;Success (0) in
cput_sum, 7058: get_proc_stat
The mom tries to parse the /proc/<pid>/stat line in the following way (from
torque-2.3.0-snap.200712061242/src/resmom/linux/mom_mach.c):
fscanf(fd,"%d (%[^)]) %c %d %d %d
That will probably break when parsing a name like '(ib_fmr(mthca0))', because it
will assume the first ')' is the closing bracket, which is just not true.
'man 5 proc' suggests using '%s', but that would be even worse than the
current '%[^)]', breaking on every executable name that contains a
space. And what if someone wants to run a monster like the following:
6849 (te (s)( ))t)) S 25614 6849 25614 34838 6849 4194304 161 0 0 0 0 0
0 0 20 0 1 0 36168980 2564096 77 18446744073709551615 4194304 4195956
140736421683184 18446744073709551615 47252866936498 0 0 0 0 0 0 0 17 0 0
0 0
The only proper fix would probably be to look for the last ')' in the
whole string.
And here's my suggestion for a patch. Patchfile is against torque 2.2.1.
--
Michael Meier, HPC Services
Friedrich-Alexander-Universitaet Erlangen-Nuernberg
Regionales Rechenzentrum Erlangen
Martensstrasse 1, 91058 Erlangen, Germany
Tel.: +49 9131 85-28973, Fax: +49 9131 302941
michael.meier@xxxxxxxxxxxxxxxxxxxx
www.rrze.uni-erlangen.de/hpc/
--- /tmp/torque-2.2.1/src/resmom/linux/mom_mach.c 2007-11-02
19:30:45.000000000 +0100
+++ /home/rzsunhome/unrz/unrz191/mom_mach_fixed.c 2007-12-07
18:13:40.795601000 +0100
@@ -282,11 +282,11 @@
/* NOTE: leading '*' indicates that field should be ignored */
/* FORMAT: <PID> <COMM> <STATE> <PPID> <PGRP> <SESSION> [<TTY_NR>] [<TPGID>]
<FLAGS> [<MINFLT>] [<CMINFLT>] [<MAJFLT>] [<CMAJFLT>] <UTIME> <STIME> <CUTIME>
<CSTIME> [<PRIORITY>] [<NICE>] [<0>] [<ITREALVALUE>] <STARTTIME> <VSIZE> <RSS>
[<RLIM>] [<STARTCODE>] ... */
-static char stat_str[] = "%d (%[^)]) %c %d %d %d %*d %*d %u %*u \
+static char stat_str[] = " %c %d %d %d %*d %*d %u %*u \
%*u %*u %*u %lu %lu %lu %lu %*ld %*ld %*u %*ld %lu %llu %lld %*lu %*lu \
%*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu %*lu";
/*
* Convert jiffies to seconds.
@@ -310,10 +310,12 @@
int pid) /* I */
{
static proc_stat_t ps;
static char path[1024];
+ static char readbuf[4096];
+ static char *lastbracket;
FILE *fd;
unsigned long jstarttime; /* number of jiffies since OS start time
when process started */
struct stat sb;
static int Hertz = 0;
@@ -348,15 +350,32 @@
return(NULL);
}
/* use 'man 5 proc' for /proc/pid/stat format */
- /* see stat_str[] value for mapping 'stat' format */
+ if (!fgets(readbuf, sizeof(readbuf), fd)) {
+ fclose(fd);
+ return(NULL);
+ }
+
+ lastbracket = strrchr(readbuf, ')');
+ if (lastbracket == NULL) {
+ fclose(fd);
+ return(NULL);
+ }
+ *lastbracket = '\0'; /* We basically split the string here, overwriting the
')'. */
+ lastbracket++;
+ if (sscanf(readbuf, "%d (%[^\n]", &ps.pid, path) != 2) {
+ /* FAILURE */
- if (fscanf(fd,stat_str,
- &ps.pid, /* PID */
- path, /* exe */
+ fclose(fd);
+
+ return(NULL);
+ }
+
+ /* see stat_str[] value for mapping 'stat' format */
+ if (sscanf(lastbracket,stat_str,
&ps.state, /* state (one of RSDZTW) */
&ps.ppid, /* ppid */
&ps.pgrp, /* pgrp */
&ps.session, /* session id */
&ps.flags, /* flags - kernel flags of the process, see the PF_* in
<linux/sched.h> */
@@ -364,11 +383,11 @@
&ps.stime, /* stime - jiffies that this process has been scheduled
in kernel mode */
&ps.cutime, /* cutime - jiffies that this process's waited-for
children have been scheduled in user mode */
&ps.cstime, /* cstime - jiffies that this process's waited-for
children have been scheduled in kernel mode */
&jstarttime, /* starttime */
&ps.vsize, /* vsize */
- &ps.rss) != 14) /* rss */
+ &ps.rss) != 12) /* rss */
{
/* FAILURE */
fclose(fd);
_______________________________________________
torqueusers mailing list
torqueusers@xxxxxxxxxxxxxxxx
http://www.supercluster.org/mailman/listinfo/torqueusers
|