From 7ff9554bb578ba02166071d2d487b7fc7d860d62 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 3 May 2012 02:29:13 +0200 Subject: printk: convert byte-buffer to variable-length record buffer - Record-based stream instead of the traditional byte stream buffer. All records carry a 64 bit timestamp, the syslog facility and priority in the record header. - Records consume almost the same amount, sometimes less memory than the traditional byte stream buffer (if printk_time is enabled). The record header is 16 bytes long, plus some padding bytes at the end if needed. The byte-stream buffer needed 3 chars for the syslog prefix, 15 char for the timestamp and a newline. - Buffer management is based on message sequence numbers. When records need to be discarded, the reading heads move on to the next full record. Unlike the byte-stream buffer, no old logged lines get truncated or partly overwritten by new ones. Sequence numbers also allow consumers of the log stream to get notified if any message in the stream they are about to read gets discarded during the time of reading. - Better buffered IO support for KERN_CONT continuation lines, when printk() is called multiple times for a single line. The use of KERN_CONT is now mandatory to use continuation; a few places in the kernel need trivial fixes here. The buffering could possibly be extended to per-cpu variables to allow better thread-safety for multiple printk() invocations for a single line. - Full-featured syslog facility value support. Different facilities can tag their messages. All userspace-injected messages enforce a facility value > 0 now, to be able to reliably distinguish them from the kernel-generated messages. Independent subsystems like a baseband processor running its own firmware, or a kernel-related userspace process can use their own unique facility values. Multiple independent log streams can co-exist that way in the same buffer. All share the same global sequence number counter to ensure proper ordering (and interleaving) and to allow the consumers of the log to reliably correlate the events from different facilities. Tested-by: William Douglas Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- drivers/char/mem.c | 55 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 38 insertions(+), 17 deletions(-) (limited to 'drivers/char') diff --git a/drivers/char/mem.c b/drivers/char/mem.c index d6e9d081c8b..cf56614479e 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -810,33 +810,54 @@ static const struct file_operations oldmem_fops = { static ssize_t kmsg_writev(struct kiocb *iocb, const struct iovec *iv, unsigned long count, loff_t pos) { - char *line, *p; + char *buf, *line; int i; - ssize_t ret = -EFAULT; + int level = default_message_loglevel; + int facility = 1; /* LOG_USER */ size_t len = iov_length(iv, count); + ssize_t ret = len; - line = kmalloc(len + 1, GFP_KERNEL); - if (line == NULL) + if (len > 1024) + return -EINVAL; + buf = kmalloc(len+1, GFP_KERNEL); + if (buf == NULL) return -ENOMEM; - /* - * copy all vectors into a single string, to ensure we do - * not interleave our log line with other printk calls - */ - p = line; + line = buf; for (i = 0; i < count; i++) { - if (copy_from_user(p, iv[i].iov_base, iv[i].iov_len)) + if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) goto out; - p += iv[i].iov_len; + line += iv[i].iov_len; + } + + /* + * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace + * the decimal value represents 32bit, the lower 3 bit are the log + * level, the rest are the log facility. + * + * If no prefix or no userspace facility is specified, we + * enforce LOG_USER, to be able to reliably distinguish + * kernel-generated messages from userspace-injected ones. + */ + line = buf; + if (line[0] == '<') { + char *endp = NULL; + + i = simple_strtoul(line+1, &endp, 10); + if (endp && endp[0] == '>') { + level = i & 7; + if (i >> 3) + facility = i >> 3; + endp++; + len -= endp - line; + line = endp; + } } - p[0] = '\0'; + line[len] = '\0'; - ret = printk("%s", line); - /* printk can add a prefix */ - if (ret > len) - ret = len; + printk_emit(facility, level, NULL, 0, "%s", line); out: - kfree(line); + kfree(buf); return ret; } -- cgit v1.2.3-70-g09d2 From e11fea92e13fb91c50bacca799a6131c81929986 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Thu, 3 May 2012 02:29:41 +0200 Subject: kmsg: export printk records to the /dev/kmsg interface Support for multiple concurrent readers of /dev/kmsg, with read(), seek(), poll() support. Output of message sequence numbers, to allow userspace log consumers to reliably reconnect and reconstruct their state at any given time. After open("/dev/kmsg"), read() always returns *all* buffered records. If only future messages should be read, SEEK_END can be used. In case records get overwritten while /dev/kmsg is held open, or records get faster overwritten than they are read, the next read() will return -EPIPE and the current reading position gets updated to the next available record. The passed sequence numbers allow the log consumer to calculate the amount of lost messages. [root@mop ~]# cat /dev/kmsg 5,0,0;Linux version 3.4.0-rc1+ (kay@mop) (gcc version 4.7.0 20120315 ... 6,159,423091;ACPI: PCI Root Bridge [PCI0] (domain 0000 [bus 00-ff]) 7,160,424069;pci_root PNP0A03:00: host bridge window [io 0x0000-0x0cf7] (ignored) SUBSYSTEM=acpi DEVICE=+acpi:PNP0A03:00 6,339,5140900;NET: Registered protocol family 10 30,340,5690716;udevd[80]: starting version 181 6,341,6081421;FDC 0 is a S82078B 6,345,6154686;microcode: CPU0 sig=0x623, pf=0x0, revision=0x0 7,346,6156968;sr 1:0:0:0: Attached scsi CD-ROM sr0 SUBSYSTEM=scsi DEVICE=+scsi:1:0:0:0 6,347,6289375;microcode: CPU1 sig=0x623, pf=0x0, revision=0x0 Cc: Karel Zak Tested-by: William Douglas Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- drivers/char/mem.c | 61 +--------- include/linux/printk.h | 2 + kernel/printk.c | 313 +++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 316 insertions(+), 60 deletions(-) (limited to 'drivers/char') diff --git a/drivers/char/mem.c b/drivers/char/mem.c index cf56614479e..0e7fbfcbd6e 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -807,65 +807,6 @@ static const struct file_operations oldmem_fops = { }; #endif -static ssize_t kmsg_writev(struct kiocb *iocb, const struct iovec *iv, - unsigned long count, loff_t pos) -{ - char *buf, *line; - int i; - int level = default_message_loglevel; - int facility = 1; /* LOG_USER */ - size_t len = iov_length(iv, count); - ssize_t ret = len; - - if (len > 1024) - return -EINVAL; - buf = kmalloc(len+1, GFP_KERNEL); - if (buf == NULL) - return -ENOMEM; - - line = buf; - for (i = 0; i < count; i++) { - if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) - goto out; - line += iv[i].iov_len; - } - - /* - * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace - * the decimal value represents 32bit, the lower 3 bit are the log - * level, the rest are the log facility. - * - * If no prefix or no userspace facility is specified, we - * enforce LOG_USER, to be able to reliably distinguish - * kernel-generated messages from userspace-injected ones. - */ - line = buf; - if (line[0] == '<') { - char *endp = NULL; - - i = simple_strtoul(line+1, &endp, 10); - if (endp && endp[0] == '>') { - level = i & 7; - if (i >> 3) - facility = i >> 3; - endp++; - len -= endp - line; - line = endp; - } - } - line[len] = '\0'; - - printk_emit(facility, level, NULL, 0, "%s", line); -out: - kfree(buf); - return ret; -} - -static const struct file_operations kmsg_fops = { - .aio_write = kmsg_writev, - .llseek = noop_llseek, -}; - static const struct memdev { const char *name; umode_t mode; @@ -884,7 +825,7 @@ static const struct memdev { [7] = { "full", 0666, &full_fops, NULL }, [8] = { "random", 0666, &random_fops, NULL }, [9] = { "urandom", 0666, &urandom_fops, NULL }, - [11] = { "kmsg", 0, &kmsg_fops, NULL }, + [11] = { "kmsg", 0644, &kmsg_fops, NULL }, #ifdef CONFIG_CRASH_DUMP [12] = { "oldmem", 0, &oldmem_fops, NULL }, #endif diff --git a/include/linux/printk.h b/include/linux/printk.h index aa3c66da105..1bec2f7a2d4 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -300,6 +300,8 @@ extern void dump_stack(void) __cold; no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif +extern const struct file_operations kmsg_fops; + enum { DUMP_PREFIX_NONE, DUMP_PREFIX_ADDRESS, diff --git a/kernel/printk.c b/kernel/printk.c index 74357329550..1ccc6d986cb 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -41,6 +41,7 @@ #include #include #include +#include #include @@ -149,6 +150,48 @@ static int console_may_schedule; * length of the message text is stored in the header, the stored message * is not terminated. * + * Optionally, a message can carry a dictionary of properties (key/value pairs), + * to provide userspace with a machine-readable message context. + * + * Examples for well-defined, commonly used property names are: + * DEVICE=b12:8 device identifier + * b12:8 block dev_t + * c127:3 char dev_t + * n8 netdev ifindex + * +sound:card0 subsystem:devname + * SUBSYSTEM=pci driver-core subsystem name + * + * Valid characters in property names are [a-zA-Z0-9.-_]. The plain text value + * follows directly after a '=' character. Every property is terminated by + * a '\0' character. The last property is not terminated. + * + * Example of a message structure: + * 0000 ff 8f 00 00 00 00 00 00 monotonic time in nsec + * 0008 34 00 record is 52 bytes long + * 000a 0b 00 text is 11 bytes long + * 000c 1f 00 dictionary is 23 bytes long + * 000e 03 00 LOG_KERN (facility) LOG_ERR (level) + * 0010 69 74 27 73 20 61 20 6c "it's a l" + * 69 6e 65 "ine" + * 001b 44 45 56 49 43 "DEVIC" + * 45 3d 62 38 3a 32 00 44 "E=b8:2\0D" + * 52 49 56 45 52 3d 62 75 "RIVER=bu" + * 67 "g" + * 0032 00 00 00 padding to next message header + * + * The 'struct log' buffer header must never be directly exported to + * userspace, it is a kernel-private implementation detail that might + * need to be changed in the future, when the requirements change. + * + * /dev/kmsg exports the structured data in the following line format: + * "level,sequnum,timestamp;\n" + * + * The optional key/value pairs are attached as continuation lines starting + * with a space character and terminated by a newline. All possible + * non-prinatable characters are escaped in the "\xff" notation. + * + * Users of the export format should ignore possible additional values + * separated by ',', and find the message after the ';' character. */ struct log { @@ -297,6 +340,276 @@ static void log_store(int facility, int level, log_next_seq++; } +/* /dev/kmsg - userspace message inject/listen interface */ +struct devkmsg_user { + u64 seq; + u32 idx; + struct mutex lock; + char buf[8192]; +}; + +static ssize_t devkmsg_writev(struct kiocb *iocb, const struct iovec *iv, + unsigned long count, loff_t pos) +{ + char *buf, *line; + int i; + int level = default_message_loglevel; + int facility = 1; /* LOG_USER */ + size_t len = iov_length(iv, count); + ssize_t ret = len; + + if (len > LOG_LINE_MAX) + return -EINVAL; + buf = kmalloc(len+1, GFP_KERNEL); + if (buf == NULL) + return -ENOMEM; + + line = buf; + for (i = 0; i < count; i++) { + if (copy_from_user(line, iv[i].iov_base, iv[i].iov_len)) + goto out; + line += iv[i].iov_len; + } + + /* + * Extract and skip the syslog prefix <[0-9]*>. Coming from userspace + * the decimal value represents 32bit, the lower 3 bit are the log + * level, the rest are the log facility. + * + * If no prefix or no userspace facility is specified, we + * enforce LOG_USER, to be able to reliably distinguish + * kernel-generated messages from userspace-injected ones. + */ + line = buf; + if (line[0] == '<') { + char *endp = NULL; + + i = simple_strtoul(line+1, &endp, 10); + if (endp && endp[0] == '>') { + level = i & 7; + if (i >> 3) + facility = i >> 3; + endp++; + len -= endp - line; + line = endp; + } + } + line[len] = '\0'; + + printk_emit(facility, level, NULL, 0, "%s", line); +out: + kfree(buf); + return ret; +} + +static ssize_t devkmsg_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + struct devkmsg_user *user = file->private_data; + struct log *msg; + size_t i; + size_t len; + ssize_t ret; + + if (!user) + return -EBADF; + + mutex_lock(&user->lock); + raw_spin_lock(&logbuf_lock); + while (user->seq == log_next_seq) { + if (file->f_flags & O_NONBLOCK) { + ret = -EAGAIN; + raw_spin_unlock(&logbuf_lock); + goto out; + } + + raw_spin_unlock(&logbuf_lock); + ret = wait_event_interruptible(log_wait, + user->seq != log_next_seq); + if (ret) + goto out; + raw_spin_lock(&logbuf_lock); + } + + if (user->seq < log_first_seq) { + /* our last seen message is gone, return error and reset */ + user->idx = log_first_idx; + user->seq = log_first_seq; + ret = -EPIPE; + raw_spin_unlock(&logbuf_lock); + goto out; + } + + msg = log_from_idx(user->idx); + len = sprintf(user->buf, "%u,%llu,%llu;", + msg->level, user->seq, msg->ts_nsec / 1000); + + /* escape non-printable characters */ + for (i = 0; i < msg->text_len; i++) { + char c = log_text(msg)[i]; + + if (c < ' ' || c >= 128) + len += sprintf(user->buf + len, "\\x%02x", c); + else + user->buf[len++] = c; + } + user->buf[len++] = '\n'; + + if (msg->dict_len) { + bool line = true; + + for (i = 0; i < msg->dict_len; i++) { + char c = log_dict(msg)[i]; + + if (line) { + user->buf[len++] = ' '; + line = false; + } + + if (c == '\0') { + user->buf[len++] = '\n'; + line = true; + continue; + } + + if (c < ' ' || c >= 128) { + len += sprintf(user->buf + len, "\\x%02x", c); + continue; + } + + user->buf[len++] = c; + } + user->buf[len++] = '\n'; + } + + user->idx = log_next(user->idx); + user->seq++; + raw_spin_unlock(&logbuf_lock); + + if (len > count) { + ret = -EINVAL; + goto out; + } + + if (copy_to_user(buf, user->buf, len)) { + ret = -EFAULT; + goto out; + } + ret = len; +out: + mutex_unlock(&user->lock); + return ret; +} + +static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) +{ + struct devkmsg_user *user = file->private_data; + loff_t ret = 0; + + if (!user) + return -EBADF; + if (offset) + return -ESPIPE; + + raw_spin_lock(&logbuf_lock); + switch (whence) { + case SEEK_SET: + /* the first record */ + user->idx = log_first_idx; + user->seq = log_first_seq; + break; + case SEEK_DATA: + /* + * The first record after the last SYSLOG_ACTION_CLEAR, + * like issued by 'dmesg -c'. Reading /dev/kmsg itself + * changes no global state, and does not clear anything. + */ + user->idx = clear_idx; + user->seq = clear_seq; + break; + case SEEK_END: + /* after the last record */ + user->idx = log_next_idx; + user->seq = log_next_seq; + break; + default: + ret = -EINVAL; + } + raw_spin_unlock(&logbuf_lock); + return ret; +} + +static unsigned int devkmsg_poll(struct file *file, poll_table *wait) +{ + struct devkmsg_user *user = file->private_data; + int ret = 0; + + if (!user) + return POLLERR|POLLNVAL; + + poll_wait(file, &log_wait, wait); + + raw_spin_lock(&logbuf_lock); + if (user->seq < log_next_seq) { + /* return error when data has vanished underneath us */ + if (user->seq < log_first_seq) + ret = POLLIN|POLLRDNORM|POLLERR|POLLPRI; + ret = POLLIN|POLLRDNORM; + } + raw_spin_unlock(&logbuf_lock); + + return ret; +} + +static int devkmsg_open(struct inode *inode, struct file *file) +{ + struct devkmsg_user *user; + int err; + + /* write-only does not need any file context */ + if ((file->f_flags & O_ACCMODE) == O_WRONLY) + return 0; + + err = security_syslog(SYSLOG_ACTION_READ_ALL); + if (err) + return err; + + user = kmalloc(sizeof(struct devkmsg_user), GFP_KERNEL); + if (!user) + return -ENOMEM; + + mutex_init(&user->lock); + + raw_spin_lock(&logbuf_lock); + user->idx = log_first_idx; + user->seq = log_first_seq; + raw_spin_unlock(&logbuf_lock); + + file->private_data = user; + return 0; +} + +static int devkmsg_release(struct inode *inode, struct file *file) +{ + struct devkmsg_user *user = file->private_data; + + if (!user) + return 0; + + mutex_destroy(&user->lock); + kfree(user); + return 0; +} + +const struct file_operations kmsg_fops = { + .open = devkmsg_open, + .read = devkmsg_read, + .aio_write = devkmsg_writev, + .llseek = devkmsg_llseek, + .poll = devkmsg_poll, + .release = devkmsg_release, +}; + #ifdef CONFIG_KEXEC /* * This appends the listed symbols to /proc/vmcoreinfo -- cgit v1.2.3-70-g09d2 From 7f3a781d6fd81e397c3928c9af33f1fc63232db6 Mon Sep 17 00:00:00 2001 From: Kay Sievers Date: Wed, 9 May 2012 01:37:51 +0200 Subject: printk - fix compilation for CONFIG_PRINTK=n Reported-by: Randy Dunlap Signed-off-by: Kay Sievers Signed-off-by: Greg Kroah-Hartman --- drivers/char/mem.c | 2 ++ kernel/printk.c | 41 ++++++++++++++++++++++------------------- 2 files changed, 24 insertions(+), 19 deletions(-) (limited to 'drivers/char') diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 0e7fbfcbd6e..67c3371723c 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -825,7 +825,9 @@ static const struct memdev { [7] = { "full", 0666, &full_fops, NULL }, [8] = { "random", 0666, &random_fops, NULL }, [9] = { "urandom", 0666, &urandom_fops, NULL }, +#ifdef CONFIG_PRINTK [11] = { "kmsg", 0644, &kmsg_fops, NULL }, +#endif #ifdef CONFIG_CRASH_DUMP [12] = { "oldmem", 0, &oldmem_fops, NULL }, #endif diff --git a/kernel/printk.c b/kernel/printk.c index 96d4cc89225..7b432951f91 100644 --- a/kernel/printk.c +++ b/kernel/printk.c @@ -126,7 +126,6 @@ EXPORT_SYMBOL(console_set_on_cmdline); /* Flag: console code may call schedule() */ static int console_may_schedule; -#ifdef CONFIG_PRINTK /* * The printk log buffer consists of a chain of concatenated variable * length records. Every record starts with a record header, containing @@ -208,16 +207,9 @@ struct log { */ static DEFINE_RAW_SPINLOCK(logbuf_lock); -/* cpu currently holding logbuf_lock */ -static volatile unsigned int logbuf_cpu = UINT_MAX; - -#define LOG_LINE_MAX 1024 - -/* record buffer */ -#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) -static char __log_buf[__LOG_BUF_LEN]; -static char *log_buf = __log_buf; -static u32 log_buf_len = __LOG_BUF_LEN; +/* the next printk record to read by syslog(READ) or /proc/kmsg */ +static u64 syslog_seq; +static u32 syslog_idx; /* index and sequence number of the first record stored in the buffer */ static u64 log_first_seq; @@ -225,15 +217,23 @@ static u32 log_first_idx; /* index and sequence number of the next record to store in the buffer */ static u64 log_next_seq; +#ifdef CONFIG_PRINTK static u32 log_next_idx; /* the next printk record to read after the last 'clear' command */ static u64 clear_seq; static u32 clear_idx; -/* the next printk record to read by syslog(READ) or /proc/kmsg */ -static u64 syslog_seq; -static u32 syslog_idx; +#define LOG_LINE_MAX 1024 + +/* record buffer */ +#define __LOG_BUF_LEN (1 << CONFIG_LOG_BUF_SHIFT) +static char __log_buf[__LOG_BUF_LEN]; +static char *log_buf = __log_buf; +static u32 log_buf_len = __LOG_BUF_LEN; + +/* cpu currently holding logbuf_lock */ +static volatile unsigned int logbuf_cpu = UINT_MAX; /* human readable text of the record */ static char *log_text(const struct log *msg) @@ -1425,13 +1425,16 @@ asmlinkage int printk(const char *fmt, ...) return r; } EXPORT_SYMBOL(printk); + #else -static void call_console_drivers(int level, const char *text, size_t len) -{ -} +#define LOG_LINE_MAX 0 +static struct log *log_from_idx(u32 idx) { return NULL; } +static u32 log_next(u32 idx) { return 0; } +static char *log_text(const struct log *msg) { return NULL; } +static void call_console_drivers(int level, const char *text, size_t len) {} -#endif +#endif /* CONFIG_PRINTK */ static int __add_preferred_console(char *name, int idx, char *options, char *brl_options) @@ -1715,7 +1718,7 @@ static u32 console_idx; * by printk(). If this is the case, console_unlock(); emits * the output prior to releasing the lock. * - * If there is output waiting, we wake it /dev/kmsg and syslog() users. + * If there is output waiting, we wake /dev/kmsg and syslog() users. * * console_unlock(); may be called from any context. */ -- cgit v1.2.3-70-g09d2