aboutsummaryrefslogtreecommitdiffstats
path: root/Documentation
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation')
-rw-r--r--Documentation/Changes1
-rw-r--r--Documentation/DMA-API.txt79
-rw-r--r--Documentation/DocBook/kernel-api.tmpl19
-rw-r--r--Documentation/DocBook/uio-howto.tmpl4
-rw-r--r--Documentation/dontdiff3
-rw-r--r--Documentation/fb/pvr2fb.txt22
-rw-r--r--Documentation/feature-removal-schedule.txt16
-rw-r--r--Documentation/filesystems/hfsplus.txt59
-rw-r--r--Documentation/hpet.txt2
-rw-r--r--Documentation/hwmon/adm10314
-rw-r--r--Documentation/hwmon/thmc5074
-rw-r--r--Documentation/i386/zero-page.txt10
-rw-r--r--Documentation/ja_JP/HOWTO66
-rw-r--r--Documentation/ja_JP/stable_api_nonsense.txt20
-rw-r--r--Documentation/kbuild/kconfig-language.txt9
-rw-r--r--Documentation/kernel-parameters.txt147
-rw-r--r--Documentation/keys.txt5
-rw-r--r--Documentation/kobject.txt178
-rw-r--r--Documentation/lguest/Makefile4
-rw-r--r--Documentation/lguest/extract58
-rw-r--r--Documentation/lguest/lguest.c620
-rw-r--r--Documentation/memory-hotplug.txt322
-rw-r--r--Documentation/sched-design-CFS.txt2
-rw-r--r--Documentation/sched-nice-design.txt108
-rw-r--r--Documentation/sched-stats.txt195
-rw-r--r--Documentation/spi/spidev_test.c202
-rw-r--r--Documentation/stable_api_nonsense.txt2
-rw-r--r--Documentation/sysfs-rules.txt72
-rw-r--r--Documentation/sysrq.txt4
-rw-r--r--Documentation/thinkpad-acpi.txt4
-rw-r--r--Documentation/vm/slabinfo.c2
31 files changed, 1809 insertions, 504 deletions
diff --git a/Documentation/Changes b/Documentation/Changes
index 73a8617f1861..cb2b141b1c3e 100644
--- a/Documentation/Changes
+++ b/Documentation/Changes
@@ -45,6 +45,7 @@ o nfs-utils 1.0.5 # showmount --version
o procps 3.2.0 # ps --version
o oprofile 0.9 # oprofiled --version
o udev 081 # udevinfo -V
+o grub 0.93 # grub --version
Kernel compilation
==================
diff --git a/Documentation/DMA-API.txt b/Documentation/DMA-API.txt
index 805db4b2cba6..cc7a8c39fb6f 100644
--- a/Documentation/DMA-API.txt
+++ b/Documentation/DMA-API.txt
@@ -26,7 +26,7 @@ Part Ia - Using large dma-coherent buffers
void *
dma_alloc_coherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, int flag)
+ dma_addr_t *dma_handle, gfp_t flag)
void *
pci_alloc_consistent(struct pci_dev *dev, size_t size,
dma_addr_t *dma_handle)
@@ -38,7 +38,7 @@ to make sure to flush the processor's write buffers before telling
devices to read that memory.)
This routine allocates a region of <size> bytes of consistent memory.
-it also returns a <dma_handle> which may be cast to an unsigned
+It also returns a <dma_handle> which may be cast to an unsigned
integer the same width as the bus and used as the physical address
base of the region.
@@ -52,21 +52,21 @@ The simplest way to do that is to use the dma_pool calls (see below).
The flag parameter (dma_alloc_coherent only) allows the caller to
specify the GFP_ flags (see kmalloc) for the allocation (the
-implementation may chose to ignore flags that affect the location of
+implementation may choose to ignore flags that affect the location of
the returned memory, like GFP_DMA). For pci_alloc_consistent, you
must assume GFP_ATOMIC behaviour.
void
-dma_free_coherent(struct device *dev, size_t size, void *cpu_addr
+dma_free_coherent(struct device *dev, size_t size, void *cpu_addr,
dma_addr_t dma_handle)
void
-pci_free_consistent(struct pci_dev *dev, size_t size, void *cpu_addr
+pci_free_consistent(struct pci_dev *dev, size_t size, void *cpu_addr,
dma_addr_t dma_handle)
Free the region of consistent memory you previously allocated. dev,
size and dma_handle must all be the same as those passed into the
consistent allocate. cpu_addr must be the virtual address returned by
-the consistent allocate
+the consistent allocate.
Part Ib - Using small dma-coherent buffers
@@ -77,9 +77,9 @@ To get this part of the dma_ API, you must #include <linux/dmapool.h>
Many drivers need lots of small dma-coherent memory regions for DMA
descriptors or I/O buffers. Rather than allocating in units of a page
or more using dma_alloc_coherent(), you can use DMA pools. These work
-much like a struct kmem_cache, except that they use the dma-coherent allocator
+much like a struct kmem_cache, except that they use the dma-coherent allocator,
not __get_free_pages(). Also, they understand common hardware constraints
-for alignment, like queue heads needing to be aligned on N byte boundaries.
+for alignment, like queue heads needing to be aligned on N-byte boundaries.
struct dma_pool *
@@ -102,15 +102,15 @@ crossing restrictions, pass 0 for alloc; passing 4096 says memory allocated
from this pool must not cross 4KByte boundaries.
- void *dma_pool_alloc(struct dma_pool *pool, int gfp_flags,
+ void *dma_pool_alloc(struct dma_pool *pool, gfp_t gfp_flags,
dma_addr_t *dma_handle);
- void *pci_pool_alloc(struct pci_pool *pool, int gfp_flags,
+ void *pci_pool_alloc(struct pci_pool *pool, gfp_t gfp_flags,
dma_addr_t *dma_handle);
This allocates memory from the pool; the returned memory will meet the size
and alignment requirements specified at creation time. Pass GFP_ATOMIC to
-prevent blocking, or if it's permitted (not in_interrupt, not holding SMP locks)
+prevent blocking, or if it's permitted (not in_interrupt, not holding SMP locks),
pass GFP_KERNEL to allow blocking. Like dma_alloc_coherent(), this returns
two values: an address usable by the cpu, and the dma address usable by the
pool's device.
@@ -123,7 +123,7 @@ pool's device.
dma_addr_t addr);
This puts memory back into the pool. The pool is what was passed to
-the pool allocation routine; the cpu and dma addresses are what
+the pool allocation routine; the cpu (vaddr) and dma addresses are what
were returned when that routine allocated the memory being freed.
@@ -209,18 +209,18 @@ Notes: Not all memory regions in a machine can be mapped by this
API. Further, regions that appear to be physically contiguous in
kernel virtual space may not be contiguous as physical memory. Since
this API does not provide any scatter/gather capability, it will fail
-if the user tries to map a non physically contiguous piece of memory.
+if the user tries to map a non-physically contiguous piece of memory.
For this reason, it is recommended that memory mapped by this API be
-obtained only from sources which guarantee to be physically contiguous
+obtained only from sources which guarantee it to be physically contiguous
(like kmalloc).
Further, the physical address of the memory must be within the
dma_mask of the device (the dma_mask represents a bit mask of the
-addressable region for the device. i.e. if the physical address of
+addressable region for the device. I.e., if the physical address of
the memory anded with the dma_mask is still equal to the physical
address, then the device can perform DMA to the memory). In order to
ensure that the memory allocated by kmalloc is within the dma_mask,
-the driver may specify various platform dependent flags to restrict
+the driver may specify various platform-dependent flags to restrict
the physical memory range of the allocation (e.g. on x86, GFP_DMA
guarantees to be within the first 16Mb of available physical memory,
as required by ISA devices).
@@ -244,14 +244,14 @@ are guaranteed also to be cache line boundaries).
DMA_TO_DEVICE synchronisation must be done after the last modification
of the memory region by the software and before it is handed off to
-the driver. Once this primitive is used. Memory covered by this
-primitive should be treated as read only by the device. If the device
+the driver. Once this primitive is used, memory covered by this
+primitive should be treated as read-only by the device. If the device
may write to it at any point, it should be DMA_BIDIRECTIONAL (see
below).
DMA_FROM_DEVICE synchronisation must be done before the driver
accesses data that may be changed by the device. This memory should
-be treated as read only by the driver. If the driver needs to write
+be treated as read-only by the driver. If the driver needs to write
to it at any point, it should be DMA_BIDIRECTIONAL (see below).
DMA_BIDIRECTIONAL requires special handling: it means that the driver
@@ -261,7 +261,7 @@ you must always sync bidirectional memory twice: once before the
memory is handed off to the device (to make sure all memory changes
are flushed from the processor) and once before the data may be
accessed after being used by the device (to make sure any processor
-cache lines are updated with data that the device may have changed.
+cache lines are updated with data that the device may have changed).
void
dma_unmap_single(struct device *dev, dma_addr_t dma_addr, size_t size,
@@ -302,8 +302,8 @@ pci_dma_mapping_error(dma_addr_t dma_addr)
In some circumstances dma_map_single and dma_map_page will fail to create
a mapping. A driver can check for these errors by testing the returned
-dma address with dma_mapping_error(). A non zero return value means the mapping
-could not be created and the driver should take appropriate action (eg
+dma address with dma_mapping_error(). A non-zero return value means the mapping
+could not be created and the driver should take appropriate action (e.g.
reduce current DMA mapping usage or delay and try again later).
int
@@ -315,7 +315,7 @@ reduce current DMA mapping usage or delay and try again later).
Maps a scatter gather list from the block layer.
-Returns: the number of physical segments mapped (this may be shorted
+Returns: the number of physical segments mapped (this may be shorter
than <nents> passed in if the block layer determines that some
elements of the scatter/gather list are physically adjacent and thus
may be mapped with a single entry).
@@ -357,7 +357,7 @@ accessed sg->address and sg->length as shown above.
pci_unmap_sg(struct pci_dev *hwdev, struct scatterlist *sg,
int nents, int direction)
-unmap the previously mapped scatter/gather list. All the parameters
+Unmap the previously mapped scatter/gather list. All the parameters
must be the same as those passed in to the scatter/gather mapping
API.
@@ -377,7 +377,7 @@ void
pci_dma_sync_sg(struct pci_dev *hwdev, struct scatterlist *sg,
int nelems, int direction)
-synchronise a single contiguous or scatter/gather mapping. All the
+Synchronise a single contiguous or scatter/gather mapping. All the
parameters must be the same as those passed into the single mapping
API.
@@ -406,7 +406,7 @@ API at all.
void *
dma_alloc_noncoherent(struct device *dev, size_t size,
- dma_addr_t *dma_handle, int flag)
+ dma_addr_t *dma_handle, gfp_t flag)
Identical to dma_alloc_coherent() except that the platform will
choose to return either consistent or non-consistent memory as it sees
@@ -426,34 +426,34 @@ void
dma_free_noncoherent(struct device *dev, size_t size, void *cpu_addr,
dma_addr_t dma_handle)
-free memory allocated by the nonconsistent API. All parameters must
+Free memory allocated by the nonconsistent API. All parameters must
be identical to those passed in (and returned by
dma_alloc_noncoherent()).
int
dma_is_consistent(struct device *dev, dma_addr_t dma_handle)
-returns true if the device dev is performing consistent DMA on the memory
+Returns true if the device dev is performing consistent DMA on the memory
area pointed to by the dma_handle.
int
dma_get_cache_alignment(void)
-returns the processor cache alignment. This is the absolute minimum
+Returns the processor cache alignment. This is the absolute minimum
alignment *and* width that you must observe when either mapping
memory or doing partial flushes.
Notes: This API may return a number *larger* than the actual cache
line, but it will guarantee that one or more cache lines fit exactly
into the width returned by this call. It will also always be a power
-of two for easy alignment
+of two for easy alignment.
void
dma_sync_single_range(struct device *dev, dma_addr_t dma_handle,
unsigned long offset, size_t size,
enum dma_data_direction direction)
-does a partial sync. starting at offset and continuing for size. You
+Does a partial sync, starting at offset and continuing for size. You
must be careful to observe the cache alignment and width when doing
anything like this. You must also be extra careful about accessing
memory you intend to sync partially.
@@ -472,21 +472,20 @@ dma_declare_coherent_memory(struct device *dev, dma_addr_t bus_addr,
dma_addr_t device_addr, size_t size, int
flags)
-
Declare region of memory to be handed out by dma_alloc_coherent when
it's asked for coherent memory for this device.
bus_addr is the physical address to which the memory is currently
assigned in the bus responding region (this will be used by the
-platform to perform the mapping)
+platform to perform the mapping).
device_addr is the physical address the device needs to be programmed
with actually to address this memory (this will be handed out as the
-dma_addr_t in dma_alloc_coherent())
+dma_addr_t in dma_alloc_coherent()).
size is the size of the area (must be multiples of PAGE_SIZE).
-flags can be or'd together and are
+flags can be or'd together and are:
DMA_MEMORY_MAP - request that the memory returned from
dma_alloc_coherent() be directly writable.
@@ -494,7 +493,7 @@ dma_alloc_coherent() be directly writable.
DMA_MEMORY_IO - request that the memory returned from
dma_alloc_coherent() be addressable using read/write/memcpy_toio etc.
-One or both of these flags must be present
+One or both of these flags must be present.
DMA_MEMORY_INCLUDES_CHILDREN - make the declared memory be allocated by
dma_alloc_coherent of any child devices of this one (for memory residing
@@ -528,7 +527,7 @@ dma_release_declared_memory(struct device *dev)
Remove the memory region previously declared from the system. This
API performs *no* in-use checking for this region and will return
unconditionally having removed all the required structures. It is the
-drivers job to ensure that no parts of this memory region are
+driver's job to ensure that no parts of this memory region are
currently in use.
void *
@@ -538,12 +537,10 @@ dma_mark_declared_memory_occupied(struct device *dev,
This is used to occupy specific regions of the declared space
(dma_alloc_coherent() will hand out the first free region it finds).
-device_addr is the *device* address of the region requested
+device_addr is the *device* address of the region requested.
-size is the size (and should be a page sized multiple).
+size is the size (and should be a page-sized multiple).
The return value will be either a pointer to the processor virtual
address of the memory, or an error (via PTR_ERR()) if any part of the
region is occupied.
-
-
diff --git a/Documentation/DocBook/kernel-api.tmpl b/Documentation/DocBook/kernel-api.tmpl
index eb42bf9847cb..b886f52a9aac 100644
--- a/Documentation/DocBook/kernel-api.tmpl
+++ b/Documentation/DocBook/kernel-api.tmpl
@@ -380,7 +380,6 @@ X!Edrivers/base/interface.c
!Edrivers/base/bus.c
</sect1>
<sect1><title>Device Drivers Power Management</title>
-!Edrivers/base/power/main.c
!Edrivers/base/power/resume.c
!Edrivers/base/power/suspend.c
</sect1>
@@ -398,12 +397,12 @@ X!Edrivers/acpi/pci_bind.c
-->
</sect1>
<sect1><title>Device drivers PnP support</title>
-!Edrivers/pnp/core.c
+!Idrivers/pnp/core.c
<!-- No correct structured comments
X!Edrivers/pnp/system.c
-->
!Edrivers/pnp/card.c
-!Edrivers/pnp/driver.c
+!Idrivers/pnp/driver.c
!Edrivers/pnp/manager.c
!Edrivers/pnp/support.c
</sect1>
@@ -704,14 +703,22 @@ X!Idrivers/video/console/fonts.c
<chapter id="splice">
<title>splice API</title>
- <para>)
+ <para>
splice is a method for moving blocks of data around inside the
- kernel, without continually transferring it between the kernel
+ kernel, without continually transferring them between the kernel
and user space.
</para>
-!Iinclude/linux/splice.h
!Ffs/splice.c
</chapter>
+ <chapter id="pipes">
+ <title>pipes API</title>
+ <para>
+ Pipe interfaces are all for in-kernel (builtin image) use.
+ They are not exported for use by modules.
+ </para>
+!Iinclude/linux/pipe_fs_i.h
+!Ffs/pipe.c
+ </chapter>
</book>
diff --git a/Documentation/DocBook/uio-howto.tmpl b/Documentation/DocBook/uio-howto.tmpl
index e3bb29a8d8dd..c119484258b8 100644
--- a/Documentation/DocBook/uio-howto.tmpl
+++ b/Documentation/DocBook/uio-howto.tmpl
@@ -133,10 +133,6 @@ interested in translating it, please email me
<para>updates of your driver can take place without recompiling
the kernel.</para>
</listitem>
-<listitem>
- <para>if you need to keep some parts of your driver closed source,
- you can do so without violating the GPL license on the kernel.</para>
-</listitem>
</itemizedlist>
<sect1 id="how_uio_works">
diff --git a/Documentation/dontdiff b/Documentation/dontdiff
index 595a5ea4c690..7b9551fc6fe3 100644
--- a/Documentation/dontdiff
+++ b/Documentation/dontdiff
@@ -18,6 +18,7 @@
*.moc
*.mod.c
*.o
+*.o.*
*.orig
*.out
*.pdf
@@ -163,6 +164,8 @@ raid6tables.c
relocs
series
setup
+setup.bin
+setup.elf
sim710_d.h*
sImage
sm_tbl*
diff --git a/Documentation/fb/pvr2fb.txt b/Documentation/fb/pvr2fb.txt
index 2bf6c2321c2d..36bdeff585e2 100644
--- a/Documentation/fb/pvr2fb.txt
+++ b/Documentation/fb/pvr2fb.txt
@@ -9,14 +9,13 @@ one found in the Dreamcast.
Advantages:
* It provides a nice large console (128 cols + 48 lines with 1024x768)
- without using tiny, unreadable fonts.
+ without using tiny, unreadable fonts (NOT on the Dreamcast)
* You can run XF86_FBDev on top of /dev/fb0
* Most important: boot logo :-)
Disadvantages:
- * Driver is currently limited to the Dreamcast PowerVR 2 implementation
- at the time of this writing.
+ * Driver is largely untested on non-Dreamcast systems.
Configuration
=============
@@ -29,11 +28,16 @@ Accepted options:
font:X - default font to use. All fonts are supported, including the
SUN12x22 font which is very nice at high resolutions.
-mode:X - default video mode. The following video modes are supported:
- 640x240-60, 640x480-60.
+mode:X - default video mode with format [xres]x[yres]-<bpp>@<refresh rate>
+ The following video modes are supported:
+ 640x480-16@60, 640x480-24@60, 640x480-32@60. The Dreamcast
+ defaults to 640x480-16@60. At the time of writing the
+ 24bpp and 32bpp modes function poorly. Work to fix that is
+ ongoing.
+
Note: the 640x240 mode is currently broken, and should not be
- used for any reason. It is only mentioned as a reference.
+ used for any reason. It is only mentioned here as a reference.
inverse - invert colors on screen (for LCD displays)
@@ -52,10 +56,10 @@ output:X - output type. This can be any of the following: pal, ntsc, and
X11
===
-XF86_FBDev should work, in theory. At the time of this writing it is
-totally untested and may or may not even portray the beginnings of
-working. If you end up testing this, please let me know!
+XF86_FBDev has been shown to work on the Dreamcast in the past - though not yet
+on any 2.6 series kernel.
--
Paul Mundt <lethal@linuxdc.org>
+Updated by Adrian McMenamin <adrian@mcmen.demon.co.uk>
diff --git a/Documentation/feature-removal-schedule.txt b/Documentation/feature-removal-schedule.txt
index c175eedadb5f..a43d2878a4ef 100644
--- a/Documentation/feature-removal-schedule.txt
+++ b/Documentation/feature-removal-schedule.txt
@@ -211,22 +211,6 @@ Who: Richard Purdie <rpurdie@rpsys.net>
---------------------------
-What: read_dev_chars(), read_conf_data{,_lpm}() (s390 common I/O layer)
-When: December 2007
-Why: These functions are a leftover from 2.4 times. They have several
- problems:
- - Duplication of checks that are done in the device driver's
- interrupt handler
- - common I/O layer can't do device specific error recovery
- - device driver can't be notified for conditions happening during
- execution of the function
- Device drivers should issue the read device characteristics and read
- configuration data ccws and do the appropriate error handling
- themselves.
-Who: Cornelia Huck <cornelia.huck@de.ibm.com>
-
----------------------------
-
What: i2c-ixp2000, i2c-ixp4xx and scx200_i2c drivers
When: September 2007
Why: Obsolete. The new i2c-gpio driver replaces all hardware-specific
diff --git a/Documentation/filesystems/hfsplus.txt b/Documentation/filesystems/hfsplus.txt
new file mode 100644
index 000000000000..af1628a1061c
--- /dev/null
+++ b/Documentation/filesystems/hfsplus.txt
@@ -0,0 +1,59 @@
+
+Macintosh HFSPlus Filesystem for Linux
+======================================
+
+HFSPlus is a filesystem first introduced in MacOS 8.1.
+HFSPlus has several extensions to HFS, including 32-bit allocation
+blocks, 255-character unicode filenames, and file sizes of 2^63 bytes.
+
+
+Mount options
+=============
+
+When mounting an HFSPlus filesystem, the following options are accepted:
+
+ creator=cccc, type=cccc
+ Specifies the creator/type values as shown by the MacOS finder
+ used for creating new files. Default values: '????'.
+
+ uid=n, gid=n
+ Specifies the user/group that owns all files on the filesystem
+ that have uninitialized permissions structures.
+ Default: user/group id of the mounting process.
+
+ umask=n
+ Specifies the umask (in octal) used for files and directories
+ that have uninitialized permissions structures.
+ Default: umask of the mounting process.
+
+ session=n
+ Select the CDROM session to mount as HFSPlus filesystem. Defaults to
+ leaving that decision to the CDROM driver. This option will fail
+ with anything but a CDROM as the underlying device.
+
+ part=n
+ Select partition number n from the devices. This option only makes
+ sense for CDROMs because they can't be partitioned under Linux.
+ For disk devices the generic partition parsing code does this
+ for us. Defaults to not parsing the partition table at all.
+
+ decompose
+ Decompose file name characters.
+
+ nodecompose
+ Do not decompose file name characters.
+
+ force
+ Used to force write access to volumes that are marked as journalled
+ or locked. Use at your own risk.
+
+ nls=cccc
+ Encoding to use when presenting file names.
+
+
+References
+==========
+
+kernel source: <file:fs/hfsplus>
+
+Apple Technote 1150 http://developer.apple.com/technotes/tn/tn1150.html
diff --git a/Documentation/hpet.txt b/Documentation/hpet.txt
index b7a3dc38dd52..6ad52d9dad6c 100644
--- a/Documentation/hpet.txt
+++ b/Documentation/hpet.txt
@@ -5,7 +5,7 @@ for the 8254 and Real Time Clock (RTC) periodic timer functionality.
Each HPET can have up to 32 timers. It is possible to configure the
first two timers as legacy replacements for 8254 and RTC periodic timers.
A specification done by Intel and Microsoft can be found at
-<http://www.intel.com/hardwaredesign/hpetspec.htm>.
+<http://www.intel.com/technology/architecture/hpetspec.htm>.
The driver supports detection of HPET driver allocation and initialization
of the HPET before the driver module_init routine is called. This enables
diff --git a/Documentation/hwmon/adm1031 b/Documentation/hwmon/adm1031
index 130a38382b98..be92a77da1d5 100644
--- a/Documentation/hwmon/adm1031
+++ b/Documentation/hwmon/adm1031
@@ -6,13 +6,13 @@ Supported chips:
Prefix: 'adm1030'
Addresses scanned: I2C 0x2c to 0x2e
Datasheet: Publicly available at the Analog Devices website
- http://products.analog.com/products/info.asp?product=ADM1030
+ http://www.analog.com/en/prod/0%2C2877%2CADM1030%2C00.html
* Analog Devices ADM1031
Prefix: 'adm1031'
Addresses scanned: I2C 0x2c to 0x2e
Datasheet: Publicly available at the Analog Devices website
- http://products.analog.com/products/info.asp?product=ADM1031
+ http://www.analog.com/en/prod/0%2C2877%2CADM1031%2C00.html
Authors:
Alexandre d'Alton <alex@alexdalton.org>
diff --git a/Documentation/hwmon/thmc50 b/Documentation/hwmon/thmc50
new file mode 100644
index 000000000000..9639ca93d559
--- /dev/null
+++ b/Documentation/hwmon/thmc50
@@ -0,0 +1,74 @@
+Kernel driver thmc50
+=====================
+
+Supported chips:
+ * Analog Devices ADM1022
+ Prefix: 'adm1022'
+ Addresses scanned: I2C 0x2c - 0x2e
+ Datasheet: http://www.analog.com/en/prod/0,2877,ADM1022,00.html
+ * Texas Instruments THMC50
+ Prefix: 'thmc50'
+ Addresses scanned: I2C 0x2c - 0x2e
+ Datasheet: http://focus.ti.com/docs/prod/folders/print/thmc50.html
+
+Author: Krzysztof Helt <krzysztof.h1@wp.pl>
+
+This driver was derived from the 2.4 kernel thmc50.c source file.
+
+Credits:
+ thmc50.c (2.4 kernel):
+ Frodo Looijaard <frodol@dds.nl>
+ Philip Edelbrock <phil@netroedge.com>
+
+Module Parameters
+-----------------
+
+* adm1022_temp3: short array
+ List of adapter,address pairs to force chips into ADM1022 mode with
+ second remote temperature. This does not work for original THMC50 chips.
+
+Description
+-----------
+
+The THMC50 implements: an internal temperature sensor, support for an
+external diode-type temperature sensor (compatible w/ the diode sensor inside
+many processors), and a controllable fan/analog_out DAC. For the temperature
+sensors, limits can be set through the appropriate Overtemperature Shutdown
+register and Hysteresis register. Each value can be set and read to half-degree
+accuracy. An alarm is issued (usually to a connected LM78) when the
+temperature gets higher than the Overtemperature Shutdown value; it stays on
+until the temperature falls below the Hysteresis value. All temperatures are in
+degrees Celsius, and are guaranteed within a range of -55 to +125 degrees.
+
+The THMC50 only updates its values each 1.5 seconds; reading it more often
+will do no harm, but will return 'old' values.
+
+The THMC50 is usually used in combination with LM78-like chips, to measure
+the temperature of the processor(s).
+
+The ADM1022 works the same as THMC50 but it is faster (5 Hz instead of
+1 Hz for THMC50). It can also be put in a new mode to handle an additional
+remote temperature sensor. The driver uses the mode set by the BIOS by default.
+
+In case the BIOS is broken and the mode is set incorrectly, you can force
+the mode with additional remote temperature with adm1022_temp3 parameter.
+A typical symptom of wrong setting is a fan forced to full speed.
+
+Driver Features
+---------------
+
+The driver provides up to three temperatures:
+
+temp1 -- internal
+temp2 -- remote
+temp3 -- 2nd remote only for ADM1022
+
+pwm1 -- fan speed (0 = stop, 255 = full)
+pwm1_mode -- always 0 (DC mode)
+
+The value of 0 for pwm1 also forces FAN_OFF signal from the chip,
+so it stops fans even if the value 0 into the ANALOG_OUT register does not.
+
+The driver was tested on Compaq AP550 with two ADM1022 chips (one works
+in the temp3 mode), five temperature readings and two fans.
+
diff --git a/Documentation/i386/zero-page.txt b/Documentation/i386/zero-page.txt
index 75b3680c41eb..6c0817c45683 100644
--- a/Documentation/i386/zero-page.txt
+++ b/Documentation/i386/zero-page.txt
@@ -1,3 +1,13 @@
+---------------------------------------------------------------------------
+!!!!!!!!!!!!!!!WARNING!!!!!!!!
+The zero page is a kernel internal data structure, not a stable ABI. It might change
+without warning and the kernel has no way to detect old versions of it.
+If you're writing some external code like a boot loader you should only use
+the stable versioned real mode boot protocol described in boot.txt. Otherwise the kernel
+might break you at any time.
+!!!!!!!!!!!!!WARNING!!!!!!!!!!!
+----------------------------------------------------------------------------
+
Summary of boot_params layout (kernel point of view)
( collected by Hans Lermen and Martin Mares )
diff --git a/Documentation/ja_JP/HOWTO b/Documentation/ja_JP/HOWTO
index b2446a090870..9f08dab1e75b 100644
--- a/Documentation/ja_JP/HOWTO
+++ b/Documentation/ja_JP/HOWTO
@@ -1,23 +1,24 @@
-NOTE:
-This is Japanese translated version of "Documentation/HOWTO".
-This one is maintained by Tsugikazu Shibata <tshibata@ab.jp.nec.com>
-and JF Project team <www.linux.or.jp/JF>.
-If you find difference with original file or problem in translation,
-please contact maintainer of this file or JF project.
-
-Please also note that purpose of this file is easier to read for non
-English natives and not to be intended to fork. So, if you have any
-comments or updates of this file, please try to update Original(English)
-file at first.
-
-Last Updated: 2007/06/04
+NOTE:
+This is a version of Documentation/HOWTO translated into Japanese.
+This document is maintained by Tsugikazu Shibata <tshibata@ab.jp.nec.com>
+and the JF Project team <www.linux.or.jp/JF>.
+If you find any difference between this document and the original file
+or a problem with the translation,
+please contact the maintainer of this file or JF project.
+
+Please also note that the purpose of this file is to be easier to read
+for non English (read: Japanese) speakers and is not intended as a
+fork. So if you have any comments or updates for this file, please try
+to update the original English file first.
+
+Last Updated: 2007/07/18
==================================
これは、
-linux-2.6.21/Documentation/HOWTO
+linux-2.6.22/Documentation/HOWTO
の和訳です。
翻訳団体: JF プロジェクト < http://www.linux.or.jp/JF/ >
-翻訳日: 2007/06/04
+翻訳日: 2007/07/16
翻訳者: Tsugikazu Shibata <tshibata at ab dot jp dot nec dot com>
校正者: 松倉さん <nbh--mats at nifty dot com>
小林 雅典さん (Masanori Kobayasi) <zap03216 at nifty dot ne dot jp>
@@ -52,6 +53,7 @@ Linux カーネル開発コミュニティと共に活動するやり方を学
また、このコミュニティがなぜ今うまくまわっているのかという理由の一部も
説明しようと試みています。
+
カーネルは 少量のアーキテクチャ依存部分がアセンブリ言語で書かれている
以外は大部分は C 言語で書かれています。C言語をよく理解していることはカー
ネル開発者には必要です。アーキテクチャ向けの低レベル部分の開発をするの
@@ -141,6 +143,7 @@ Linux カーネルソースツリーは幅広い範囲のドキュメントを
これらのルールに従えばうまくいくことを保証することではありません
が (すべてのパッチは内容とスタイルについて精査を受けるので)、
ルールに従わなければ間違いなくうまくいかないでしょう。
+
この他にパッチを作る方法についてのよくできた記述は-
"The Perfect Patch"
@@ -360,44 +363,42 @@ linux-kernel メーリングリストで収集された多数のパッチと同
git ツリー-
- Kbuild の開発ツリー、Sam Ravnborg <sam@ravnborg.org>
- kernel.org:/pub/scm/linux/kernel/git/sam/kbuild.git
+ git.kernel.org:/pub/scm/linux/kernel/git/sam/kbuild.git
- ACPI の開発ツリー、 Len Brown <len.brown@intel.com>
- kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git
+ git.kernel.org:/pub/scm/linux/kernel/git/lenb/linux-acpi-2.6.git
- Block の開発ツリー、Jens Axboe <axboe@suse.de>
- kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git
+ git.kernel.org:/pub/scm/linux/kernel/git/axboe/linux-2.6-block.git
- DRM の開発ツリー、Dave Airlie <airlied@linux.ie>
- kernel.org:/pub/scm/linux/kernel/git/airlied/drm-2.6.git
+ git.kernel.org:/pub/scm/linux/kernel/git/airlied/drm-2.6.git
- ia64 の開発ツリー、Tony Luck <tony.luck@intel.com>
- kernel.org:/pub/scm/linux/kernel/git/aegl/linux-2.6.git
-
- - ieee1394 の開発ツリー、Jody McIntyre <scjody@modernduck.com>
- kernel.org:/pub/scm/linux/kernel/git/scjody/ieee1394.git
+ git.kernel.org:/pub/scm/linux/kernel/git/aegl/linux-2.6.git
- infiniband, Roland Dreier <rolandd@cisco.com>
- kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git
+ git.kernel.org:/pub/scm/linux/kernel/git/roland/infiniband.git
- libata, Jeff Garzik <jgarzik@pobox.com>
- kernel.org:/pub/scm/linux/kernel/git/jgarzik/libata-dev.git
+ git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/libata-dev.git
- ネットワークドライバ, Jeff Garzik <jgarzik@pobox.com>
- kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git
+ git.kernel.org:/pub/scm/linux/kernel/git/jgarzik/netdev-2.6.git
- pcmcia, Dominik Brodowski <linux@dominikbrodowski.net>
- kernel.org:/pub/scm/linux/kernel/git/brodo/pcmcia-2.6.git
+ git.kernel.org:/pub/scm/linux/kernel/git/brodo/pcmcia-2.6.git
- SCSI, James Bottomley <James.Bottomley@SteelEye.com>
- kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-misc-2.6.git
-
- その他の git カーネルツリーは http://kernel.org/git に一覧表がありま
- す。
+ git.kernel.org:/pub/scm/linux/kernel/git/jejb/scsi-misc-2.6.git
quilt ツリー-
- USB, PCI ドライバコアと I2C, Greg Kroah-Hartman <gregkh@suse.de>
kernel.org/pub/linux/kernel/people/gregkh/gregkh-2.6/
+ - x86-64 と i386 の仲間 Andi Kleen <ak@suse.de>
+
+ その他のカーネルツリーは http://git.kernel.org/ と MAINTAINERS ファ
+ イルに一覧表があります。
バグレポート
-------------
@@ -508,6 +509,7 @@ MAINTAINERS ファイルにリストがありますので参照してくださ
せん*。単に自分のパッチに対して指摘された問題を全て修正して再送すれば
いいのです。
+
カーネルコミュニティと企業組織のちがい
-----------------------------------------------------------------
@@ -577,6 +579,7 @@ Linux カーネルコミュニティは、一度に大量のコードの塊を
かし、500行のパッチは、正しいことをレビューするのに数時間かかるかも
しれません(時間はパッチのサイズなどにより指数関数に比例してかかりま
す)
+
小さいパッチは何かあったときにデバッグもとても簡単になります。パッ
チを1個1個取り除くのは、とても大きなパッチを当てた後に(かつ、何かお
かしくなった後で)解剖するのに比べればとても簡単です。
@@ -591,6 +594,7 @@ Linux カーネルコミュニティは、一度に大量のコードの塊を
う。先生は簡潔な最高の解をみたいのです。良い生徒はこれを知って
おり、そして最終解の前の中間作業を提出することは決してないので
す"
+
カーネル開発でもこれは同じです。メンテナー達とレビューア達は、
問題を解決する解の背後になる思考プロセスをみたいとは思いません。
彼らは単純であざやかな解決方法をみたいのです。
diff --git a/Documentation/ja_JP/stable_api_nonsense.txt b/Documentation/ja_JP/stable_api_nonsense.txt
index b3f2b27f0881..7653b5cbfed2 100644
--- a/Documentation/ja_JP/stable_api_nonsense.txt
+++ b/Documentation/ja_JP/stable_api_nonsense.txt
@@ -1,17 +1,17 @@
NOTE:
-This is a Japanese translated version of
-"Documentation/stable_api_nonsense.txt".
-This one is maintained by
-IKEDA, Munehiro <m-ikeda@ds.jp.nec.com>
-and JF Project team <http://www.linux.or.jp/JF/>.
-If you find difference with original file or problem in translation,
+This is a version of Documentation/stable_api_nonsense.txt translated into Japanese.
+This document is maintained by IKEDA, Munehiro <m-ikeda@ds.jp.nec.com>
+and the JF Project team <http://www.linux.or.jp/JF/>.
+If you find any difference between this document and the original file
+or a problem with the translation,
please contact the maintainer of this file or JF project.
-Please also note that purpose of this file is easier to read for non
-English natives and not to be intended to fork. So, if you have any
-comments or updates of this file, please try to update
-Original(English) file at first.
+Please also note that the purpose of this file is to be easier to read
+for non English (read: Japanese) speakers and is not intended as a
+fork. So if you have any comments or updates of this file, please try
+to update the original English file first.
+Last Updated: 2007/07/18
==================================
これは、
linux-2.6.22-rc4/Documentation/stable_api_nonsense.txt の和訳
diff --git a/Documentation/kbuild/kconfig-language.txt b/Documentation/kbuild/kconfig-language.txt
index 536d5bfbdb8d..fe8b0c4892cf 100644
--- a/Documentation/kbuild/kconfig-language.txt
+++ b/Documentation/kbuild/kconfig-language.txt
@@ -98,6 +98,15 @@ applicable everywhere (see syntax).
times, the limit is set to the largest selection.
Reverse dependencies can only be used with boolean or tristate
symbols.
+ Note:
+ select is evil.... select will by brute force set a symbol
+ equal to 'y' without visiting the dependencies. So abusing
+ select you are able to select a symbol FOO even if FOO depends
+ on BAR that is not set. In general use select only for
+ non-visible symbols (no prompts anywhere) and for symbols with
+ no dependencies. That will limit the usefulness but on the
+ other hand avoid the illegal configurations all over. kconfig
+ should one day warn about such things.
- numerical ranges: "range" <symbol> <symbol> ["if" <expr>]
This allows to limit the range of possible input values for int
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index 379931e74334..09c0ec100f61 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -30,6 +30,7 @@ the beginning of each description states the restrictions within which a
parameter is applicable:
ACPI ACPI support is enabled.
+ AGP AGP (Accelerated Graphics Port) is enabled.
ALSA ALSA sound support is enabled.
APIC APIC support is enabled.
APM Advanced Power Management support is enabled.
@@ -40,7 +41,6 @@ parameter is applicable:
EIDE EIDE/ATAPI support is enabled.
FB The frame buffer device is enabled.
HW Appropriate hardware is enabled.
- IA-32 IA-32 aka i386 architecture is enabled.
IA-64 IA-64 architecture is enabled.
IOSCHED More than one I/O scheduler is enabled.
IP_PNP IP DHCP, BOOTP, or RARP is enabled.
@@ -57,14 +57,14 @@ parameter is applicable:
MDA MDA console support is enabled.
MOUSE Appropriate mouse support is enabled.
MSI Message Signaled Interrupts (PCI).
- MTD MTD support is enabled.
+ MTD MTD (Memory Technology Device) support is enabled.
NET Appropriate network support is enabled.
NUMA NUMA support is enabled.
GENERIC_TIME The generic timeofday code is enabled.
NFS Appropriate NFS support is enabled.
OSS OSS sound support is enabled.
- PV_OPS A paravirtualized kernel
- PARIDE The ParIDE subsystem is enabled.
+ PV_OPS A paravirtualized kernel is enabled.
+ PARIDE The ParIDE (parallel port IDE) subsystem is enabled.
PARISC The PA-RISC architecture is enabled.
PCI PCI bus support is enabled.
PCMCIA The PCMCIA subsystem is enabled.
@@ -91,6 +91,7 @@ parameter is applicable:
VT Virtual terminal support is enabled.
WDT Watchdog support is enabled.
XT IBM PC/XT MFM hard disk support is enabled.
+ X86-32 X86-32, aka i386 architecture is enabled.
X86-64 X86-64 architecture is enabled.
More X86-64 boot options can be found in
Documentation/x86_64/boot-options.txt .
@@ -122,10 +123,6 @@ and is between 256 and 4096 characters. It is defined in the file
./include/asm/setup.h as COMMAND_LINE_SIZE.
- 53c7xx= [HW,SCSI] Amiga SCSI controllers
- See header of drivers/scsi/53c7xx.c.
- See also Documentation/scsi/ncr53c7xx.txt.
-
acpi= [HW,ACPI,X86-64,i386]
Advanced Configuration and Power Interface
Format: { force | off | ht | strict | noirq }
@@ -224,11 +221,17 @@ and is between 256 and 4096 characters. It is defined in the file
acpi_fake_ecdt [HW,ACPI] Workaround failure due to BIOS lacking ECDT
- acpi_pm_good [IA-32,X86-64]
+ acpi_pm_good [X86-32,X86-64]
Override the pmtimer bug detection: force the kernel
to assume that this machine's pmtimer latches its value
and always returns good values.
+ agp= [AGP]
+ { off | try_unsupported }
+ off: disable AGP support
+ try_unsupported: try to drive unsupported chipsets
+ (may crash computer or cause data corruption)
+
enable_timer_pin_1 [i386,x86-64]
Enable PIN 1 of APIC timer
Can be useful to work around chipset bugs
@@ -281,7 +284,8 @@ and is between 256 and 4096 characters. It is defined in the file
not play well with APC CPU idle - disable it if you have
APC and your system crashes randomly.
- apic= [APIC,i386] Change the output verbosity whilst booting
+ apic= [APIC,i386] Advanced Programmable Interrupt Controller
+ Change the output verbosity whilst booting
Format: { quiet (default) | verbose | debug }
Change the amount of debugging information output
when initialising the APIC and IO-APIC components.
@@ -355,7 +359,7 @@ and is between 256 and 4096 characters. It is defined in the file
c101= [NET] Moxa C101 synchronous serial card
- cachesize= [BUGS=IA-32] Override level 2 CPU cache size detection.
+ cachesize= [BUGS=X86-32] Override level 2 CPU cache size detection.
Sometimes CPU hardware bugs make them report the cache
size incorrectly. The kernel will attempt work arounds
to fix known problems, but for some CPUs it is not
@@ -374,7 +378,7 @@ and is between 256 and 4096 characters. It is defined in the file
Value can be changed at runtime via
/selinux/checkreqprot.
- clock= [BUGS=IA-32, HW] gettimeofday clocksource override.
+ clock= [BUGS=X86-32, HW] gettimeofday clocksource override.
[Deprecated]
Forces specified clocksource (if available) to be used
when calculating gettimeofday(). If specified
@@ -392,7 +396,7 @@ and is between 256 and 4096 characters. It is defined in the file
[ARM] imx_timer1,OSTS,netx_timer,mpu_timer2,
pxa_timer,timer3,32k_counter,timer0_1
[AVR32] avr32
- [IA-32] pit,hpet,tsc,vmi-timer;
+ [X86-32] pit,hpet,tsc,vmi-timer;
scx200_hrt on Geode; cyclone on IBM x440
[MIPS] MIPS
[PARISC] cr16
@@ -412,7 +416,7 @@ and is between 256 and 4096 characters. It is defined in the file
over the 8254 in addition to over the IO-APIC. The
kernel tries to set a sensible default.
- hpet= [IA-32,HPET] option to disable HPET and use PIT.
+ hpet= [X86-32,HPET] option to disable HPET and use PIT.
Format: disable
com20020= [HW,NET] ARCnet - COM20020 chipset
@@ -549,7 +553,7 @@ and is between 256 and 4096 characters. It is defined in the file
dtc3181e= [HW,SCSI]
- earlyprintk= [IA-32,X86-64,SH]
+ earlyprintk= [X86-32,X86-64,SH]
earlyprintk=vga
earlyprintk=serial[,ttySn[,baudrate]]
@@ -587,7 +591,7 @@ and is between 256 and 4096 characters. It is defined in the file
eisa_irq_edge= [PARISC,HW]
See header of drivers/parisc/eisa.c.
- elanfreq= [IA-32]
+ elanfreq= [X86-32]
See comment before function elanfreq_setup() in
arch/i386/kernel/cpu/cpufreq/elanfreq.c.
@@ -596,7 +600,7 @@ and is between 256 and 4096 characters. It is defined in the file
See Documentation/block/as-iosched.txt and
Documentation/block/deadline-iosched.txt for details.
- elfcorehdr= [IA-32, X86_64]
+ elfcorehdr= [X86-32, X86_64]
Specifies physical address of start of kernel core
image elf header. Generally kexec loader will
pass this option to capture kernel.
@@ -678,7 +682,7 @@ and is between 256 and 4096 characters. It is defined in the file
hisax= [HW,ISDN]
See Documentation/isdn/README.HiSax.
- hugepages= [HW,IA-32,IA-64] Maximal number of HugeTLB pages.
+ hugepages= [HW,X86-32,IA-64] Maximal number of HugeTLB pages.
i8042.direct [HW] Put keyboard port into non-translated mode
i8042.dumbkbd [HW] Pretend that controller can only read data from
@@ -770,7 +774,8 @@ and is between 256 and 4096 characters. It is defined in the file
See Documentation/nfsroot.txt.
ip2= [HW] Set IO/IRQ pairs for up to 4 IntelliPort boards
- See comment before ip2_setup() in drivers/char/ip2.c.
+ See comment before ip2_setup() in
+ drivers/char/ip2/ip2base.c.
ips= [HW,SCSI] Adaptec / IBM ServeRAID controller
See header of drivers/scsi/ips.c.
@@ -819,7 +824,7 @@ and is between 256 and 4096 characters. It is defined in the file
js= [HW,JOY] Analog joystick
See Documentation/input/joystick.txt.
- kernelcore=nn[KMG] [KNL,IA-32,IA-64,PPC,X86-64] This parameter
+ kernelcore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter
specifies the amount of memory usable by the kernel
for non-movable allocations. The requested amount is
spread evenly throughout all nodes in the system. The
@@ -835,7 +840,7 @@ and is between 256 and 4096 characters. It is defined in the file
use the HighMem zone if it exists, and the Normal
zone if it does not.
- movablecore=nn[KMG] [KNL,IA-32,IA-64,PPC,X86-64] This parameter
+ movablecore=nn[KMG] [KNL,X86-32,IA-64,PPC,X86-64] This parameter
is similar to kernelcore except it specifies the
amount of memory used for migratable allocations.
If both kernelcore and movablecore is specified,
@@ -847,28 +852,20 @@ and is between 256 and 4096 characters. It is defined in the file
keepinitrd [HW,ARM]
- kstack=N [IA-32,X86-64] Print N words from the kernel stack
+ kstack=N [X86-32,X86-64] Print N words from the kernel stack
in oops dumps.
l2cr= [PPC]
- lapic [IA-32,APIC] Enable the local APIC even if BIOS
+ lapic [X86-32,APIC] Enable the local APIC even if BIOS
disabled it.
- lapic_timer_c2_ok [IA-32,x86-64,APIC] trust the local apic timer in
+ lapic_timer_c2_ok [X86-32,x86-64,APIC] trust the local apic timer in
C2 power state.
lasi= [HW,SCSI] PARISC LASI driver for the 53c700 chip
Format: addr:<io>,irq:<irq>
- legacy_serial.force [HW,IA-32,X86-64]
- Probe for COM ports at legacy addresses even
- if PNPBIOS or ACPI should describe them. This
- is for working around firmware defects.
-
- llsc*= [IA64] See function print_params() in
- arch/ia64/sn/kernel/llsc4.c.
-
load_ramdisk= [RAM] List of ramdisks to load from floppy
See Documentation/ramdisk.txt.
@@ -974,11 +971,11 @@ and is between 256 and 4096 characters. It is defined in the file
[SCSI] Maximum number of LUNs received.
Should be between 1 and 16384.
- mca-pentium [BUGS=IA-32]
+ mca-pentium [BUGS=X86-32]
mcatest= [IA-64]
- mce [IA-32] Machine Check Exception
+ mce [X86-32] Machine Check Exception
md= [HW] RAID subsystems devices and level
See Documentation/md.txt.
@@ -990,14 +987,14 @@ and is between 256 and 4096 characters. It is defined in the file
mem=nn[KMG] [KNL,BOOT] Force usage of a specific amount of memory
Amount of memory to be used when the kernel is not able
to see the whole system memory or for test.
- [IA-32] Use together with memmap= to avoid physical
+ [X86-32] Use together with memmap= to avoid physical
address space collisions. Without memmap= PCI devices
could be placed at addresses belonging to unused RAM.
- mem=nopentium [BUGS=IA-32] Disable usage of 4MB pages for kernel
+ mem=nopentium [BUGS=X86-32] Disable usage of 4MB pages for kernel
memory.
- memmap=exactmap [KNL,IA-32,X86_64] Enable setting of an exact
+ memmap=exactmap [KNL,X86-32,X86_64] Enable setting of an exact
E820 memory map, as specified by the user.
Such memmap=exactmap lines can be constructed based on
BIOS output or other requirements. See the memmap=nn@ss
@@ -1041,7 +1038,7 @@ and is between 256 and 4096 characters. It is defined in the file
<name>,<region-number>[,<base>,<size>,<buswidth>,<altbuswidth>]
mtdparts= [MTD]
- See drivers/mtd/cmdline.c.
+ See drivers/mtd/cmdlinepart.c.
mtouchusb.raw_coordinates=
[HW] Make the MicroTouch USB driver use raw coordinates
@@ -1083,9 +1080,9 @@ and is between 256 and 4096 characters. It is defined in the file
[NFS] set the maximum lifetime for idmapper cache
entries.
- nmi_watchdog= [KNL,BUGS=IA-32] Debugging features for SMP kernels
+ nmi_watchdog= [KNL,BUGS=X86-32] Debugging features for SMP kernels
- no387 [BUGS=IA-32] Tells the kernel to use the 387 maths
+ no387 [BUGS=X86-32] Tells the kernel to use the 387 maths
emulation library even if a 387 maths coprocessor
is present.
@@ -1116,17 +1113,17 @@ and is between 256 and 4096 characters. It is defined in the file
noexec [IA-64]
- noexec [IA-32,X86-64]
+ noexec [X86-32,X86-64]
noexec=on: enable non-executable mappings (default)
noexec=off: disable nn-executable mappings
- nofxsr [BUGS=IA-32] Disables x86 floating point extended
+ nofxsr [BUGS=X86-32] Disables x86 floating point extended
register save and restore. The kernel will only save
legacy floating-point registers on task switch.
nohlt [BUGS=ARM]
- no-hlt [BUGS=IA-32] Tells the kernel that the hlt
+ no-hlt [BUGS=X86-32] Tells the kernel that the hlt
instruction doesn't work correctly and not to
use it.
@@ -1141,12 +1138,12 @@ and is between 256 and 4096 characters. It is defined in the file
Valid arguments: on, off
Default: on
- noirqbalance [IA-32,SMP,KNL] Disable kernel irq balancing
+ noirqbalance [X86-32,SMP,KNL] Disable kernel irq balancing
- noirqdebug [IA-32] Disables the code which attempts to detect and
+ noirqdebug [X86-32] Disables the code which attempts to detect and
disable unhandled interrupt sources.
- no_timer_check [IA-32,X86_64,APIC] Disables the code which tests for
+ no_timer_check [X86-32,X86_64,APIC] Disables the code which tests for
broken timer IRQ sources.
noisapnp [ISAPNP] Disables ISA PnP code.
@@ -1158,20 +1155,20 @@ and is between 256 and 4096 characters. It is defined in the file
nojitter [IA64] Disables jitter checking for ITC timers.
- nolapic [IA-32,APIC] Do not enable or use the local APIC.
+ nolapic [X86-32,APIC] Do not enable or use the local APIC.
- nolapic_timer [IA-32,APIC] Do not use the local APIC timer.
+ nolapic_timer [X86-32,APIC] Do not use the local APIC timer.
noltlbs [PPC] Do not use large page/tlb entries for kernel
lowmem mapping on PPC40x.
nomca [IA-64] Disable machine check abort handling
- nomce [IA-32] Machine Check Exception
+ nomce [X86-32] Machine Check Exception
- noreplace-paravirt [IA-32,PV_OPS] Don't patch paravirt_ops
+ noreplace-paravirt [X86-32,PV_OPS] Don't patch paravirt_ops
- noreplace-smp [IA-32,SMP] Don't replace SMP instructions
+ noreplace-smp [X86-32,SMP] Don't replace SMP instructions
with UP alternatives
noresidual [PPC] Don't use residual data on PReP machines.
@@ -1185,7 +1182,7 @@ and is between 256 and 4096 characters. It is defined in the file
nosbagart [IA-64]
- nosep [BUGS=IA-32] Disables x86 SYSENTER/SYSEXIT support.
+ nosep [BUGS=X86-32] Disables x86 SYSENTER/SYSEXIT support.
nosmp [SMP] Tells an SMP kernel to act as a UP kernel.
@@ -1193,7 +1190,7 @@ and is between 256 and 4096 characters. It is defined in the file
nosync [HW,M68K] Disables sync negotiation for all devices.
- notsc [BUGS=IA-32] Disable Time Stamp Counter
+ notsc [BUGS=X86-32] Disable Time Stamp Counter
nousb [USB] Disable the USB subsystem
@@ -1266,28 +1263,28 @@ and is between 256 and 4096 characters. It is defined in the file
See also Documentation/paride.txt.
pci=option[,option...] [PCI] various PCI subsystem options:
- off [IA-32] don't probe for the PCI bus
- bios [IA-32] force use of PCI BIOS, don't access
+ off [X86-32] don't probe for the PCI bus
+ bios [X86-32] force use of PCI BIOS, don't access
the hardware directly. Use this if your machine
has a non-standard PCI host bridge.
- nobios [IA-32] disallow use of PCI BIOS, only direct
+ nobios [X86-32] disallow use of PCI BIOS, only direct
hardware access methods are allowed. Use this
if you experience crashes upon bootup and you
suspect they are caused by the BIOS.
- conf1 [IA-32] Force use of PCI Configuration
+ conf1 [X86-32] Force use of PCI Configuration
Mechanism 1.
- conf2 [IA-32] Force use of PCI Configuration
+ conf2 [X86-32] Force use of PCI Configuration
Mechanism 2.
- nommconf [IA-32,X86_64] Disable use of MMCONFIG for PCI
+ nommconf [X86-32,X86_64] Disable use of MMCONFIG for PCI
Configuration
nomsi [MSI] If the PCI_MSI kernel config parameter is
enabled, this kernel boot option can be used to
disable the use of MSI interrupts system-wide.
- nosort [IA-32] Don't sort PCI devices according to
+ nosort [X86-32] Don't sort PCI devices according to
order given by the PCI BIOS. This sorting is
done to get a device order compatible with
older kernels.
- biosirq [IA-32] Use PCI BIOS calls to get the interrupt
+ biosirq [X86-32] Use PCI BIOS calls to get the interrupt
routing table. These calls are known to be buggy
on several machines and they hang the machine
when used, but on other computers it's the only
@@ -1295,32 +1292,32 @@ and is between 256 and 4096 characters. It is defined in the file
this option if the kernel is unable to allocate
IRQs or discover secondary PCI buses on your
motherboard.
- rom [IA-32] Assign address space to expansion ROMs.
+ rom [X86-32] Assign address space to expansion ROMs.
Use with caution as certain devices share
address decoders between ROMs and other
resources.
- irqmask=0xMMMM [IA-32] Set a bit mask of IRQs allowed to be
+ irqmask=0xMMMM [X86-32] Set a bit mask of IRQs allowed to be
assigned automatically to PCI devices. You can
make the kernel exclude IRQs of your ISA cards
this way.
- pirqaddr=0xAAAAA [IA-32] Specify the physical address
+ pirqaddr=0xAAAAA [X86-32] Specify the physical address
of the PIRQ table (normally generated
by the BIOS) if it is outside the
F0000h-100000h range.
- lastbus=N [IA-32] Scan all buses thru bus #N. Can be
+ lastbus=N [X86-32] Scan all buses thru bus #N. Can be
useful if the kernel is unable to find your
secondary buses and you want to tell it
explicitly which ones they are.
- assign-busses [IA-32] Always assign all PCI bus
+ assign-busses [X86-32] Always assign all PCI bus
numbers ourselves, overriding
whatever the firmware may have done.
- usepirqmask [IA-32] Honor the possible IRQ mask stored
+ usepirqmask [X86-32] Honor the possible IRQ mask stored
in the BIOS $PIR table. This is needed on
some systems with broken BIOSes, notably
some HP Pavilion N5400 and Omnibook XE3
notebooks. This will have no effect if ACPI
IRQ routing is enabled.
- noacpi [IA-32] Do not use ACPI for IRQ routing
+ noacpi [X86-32] Do not use ACPI for IRQ routing
or for PCI scanning.
routeirq Do IRQ routing for all PCI devices.
This is normally done in pci_enable_device(),
@@ -1469,13 +1466,13 @@ and is between 256 and 4096 characters. It is defined in the file
Run specified binary instead of /init from the ramdisk,
used for early userspace startup. See initrd.
- reboot= [BUGS=IA-32,BUGS=ARM,BUGS=IA-64] Rebooting mode
+ reboot= [BUGS=X86-32,BUGS=ARM,BUGS=IA-64] Rebooting mode
Format: <reboot_mode>[,<reboot_mode2>[,...]]
See arch/*/kernel/reboot.c or arch/*/kernel/process.c
reserve= [KNL,BUGS] Force the kernel to ignore some iomem area
- reservetop= [IA-32]
+ reservetop= [X86-32]
Format: nn[KMG]
Reserves a hole at the top of the kernel virtual
address space.
@@ -1566,7 +1563,7 @@ and is between 256 and 4096 characters. It is defined in the file
Value can be changed at runtime via
/selinux/compat_net.
- serialnumber [BUGS=IA-32]
+ serialnumber [BUGS=X86-32]
sg_def_reserved_size= [SCSI]
@@ -1619,7 +1616,7 @@ and is between 256 and 4096 characters. It is defined in the file
smart2= [HW]
Format: <io1>[,<io2>[,...,<io8>]]
- smp-alt-once [IA-32,SMP] On a hotplug CPU system, only
+ smp-alt-once [X86-32,SMP] On a hotplug CPU system, only
attempt to substitute SMP alternatives once at boot.
smsc-ircc2.nopnp [HW] Don't use PNP to discover SMC devices
@@ -1884,7 +1881,7 @@ and is between 256 and 4096 characters. It is defined in the file
usbhid.mousepoll=
[USBHID] The interval which mice are to be polled at.
- vdso= [IA-32,SH,x86-64]
+ vdso= [X86-32,SH,x86-64]
vdso=2: enable compat VDSO (default with COMPAT_VDSO)
vdso=1: enable VDSO (default)
vdso=0: disable VDSO mapping
@@ -1895,7 +1892,7 @@ and is between 256 and 4096 characters. It is defined in the file
video= [FB] Frame buffer configuration
See Documentation/fb/modedb.txt.
- vga= [BOOT,IA-32] Select a particular video mode
+ vga= [BOOT,X86-32] Select a particular video mode
See Documentation/i386/boot.txt and
Documentation/svga.txt.
Use vga=ask for menu.
@@ -1927,7 +1924,7 @@ and is between 256 and 4096 characters. It is defined in the file
See header of drivers/scsi/wd7000.c.
wdt= [WDT] Watchdog
- See Documentation/watchdog/watchdog.txt.
+ See Documentation/watchdog/wdt.txt.
xd= [HW,XT] Original XT pre-IDE (RLL encoded) disks.
xd_geo= See header of drivers/block/xd.c.
diff --git a/Documentation/keys.txt b/Documentation/keys.txt
index 81d9aa097298..947d57d53453 100644
--- a/Documentation/keys.txt
+++ b/Documentation/keys.txt
@@ -859,9 +859,8 @@ payload contents" for more information.
void unregister_key_type(struct key_type *type);
-Under some circumstances, it may be desirable to desirable to deal with a
-bundle of keys. The facility provides access to the keyring type for managing
-such a bundle:
+Under some circumstances, it may be desirable to deal with a bundle of keys.
+The facility provides access to the keyring type for managing such a bundle:
struct key_type key_type_keyring;
diff --git a/Documentation/kobject.txt b/Documentation/kobject.txt
index e44855513b3d..8ee49ee7c963 100644
--- a/Documentation/kobject.txt
+++ b/Documentation/kobject.txt
@@ -27,7 +27,6 @@ in detail, and briefly here:
- kobjects a simple object.
- kset a set of objects of a certain type.
- ktype a set of helpers for objects of a common type.
-- subsystem a controlling object for a number of ksets.
The kobject infrastructure maintains a close relationship with the
@@ -54,13 +53,15 @@ embedded in larger data structures and replace fields they duplicate.
1.2 Definition
struct kobject {
+ const char * k_name;
char name[KOBJ_NAME_LEN];
- atomic_t refcount;
+ struct kref kref;
struct list_head entry;
struct kobject * parent;
struct kset * kset;
struct kobj_type * ktype;
- struct dentry * dentry;
+ struct sysfs_dirent * sd;
+ wait_queue_head_t poll;
};
void kobject_init(struct kobject *);
@@ -137,8 +138,7 @@ If a kobject does not have a parent when it is registered, its parent
becomes its dominant kset.
If a kobject does not have a parent nor a dominant kset, its directory
-is created at the top-level of the sysfs partition. This should only
-happen for kobjects that are embedded in a struct subsystem.
+is created at the top-level of the sysfs partition.
@@ -150,10 +150,10 @@ A kset is a set of kobjects that are embedded in the same type.
struct kset {
- struct subsystem * subsys;
struct kobj_type * ktype;
struct list_head list;
struct kobject kobj;
+ struct kset_uevent_ops * uevent_ops;
};
@@ -169,8 +169,7 @@ struct kobject * kset_find_obj(struct kset *, char *);
The type that the kobjects are embedded in is described by the ktype
-pointer. The subsystem that the kobject belongs to is pointed to by the
-subsys pointer.
+pointer.
A kset contains a kobject itself, meaning that it may be registered in
the kobject hierarchy and exported via sysfs. More importantly, the
@@ -209,6 +208,58 @@ the hierarchy.
kset_find_obj() may be used to locate a kobject with a particular
name. The kobject, if found, is returned.
+There are also some helper functions whose names point to the formerly
+existing "struct subsystem", whose functions have been taken over by
+ksets.
+
+
+decl_subsys(name,type,uevent_ops)
+
+Declares a kset named '<name>_subsys' of type <type> with
+uevent_ops <uevent_ops>. For example,
+
+decl_subsys(devices, &ktype_device, &device_uevent_ops);
+
+is equivalent to doing:
+
+struct kset devices_subsys = {
+ .kobj = {
+ .name = "devices",
+ },
+ .ktype = &ktype_device,
+ .uevent_ops = &device_uevent_ops,
+};
+
+
+The objects that are registered with a subsystem that use the
+subsystem's default list must have their kset ptr set properly. These
+objects may have embedded kobjects or ksets. The
+following helpers make setting the kset easier:
+
+
+kobj_set_kset_s(obj,subsys)
+
+- Assumes that obj->kobj exists, and is a struct kobject.
+- Sets the kset of that kobject to the kset <subsys>.
+
+
+kset_set_kset_s(obj,subsys)
+
+- Assumes that obj->kset exists, and is a struct kset.
+- Sets the kset of the embedded kobject to the kset <subsys>.
+
+subsys_set_kset(obj,subsys)
+
+- Assumes obj->subsys exists, and is a struct subsystem.
+- Sets obj->subsys.kset.kobj.kset to the subsystem's embedded kset.
+
+void subsystem_init(struct kset *s);
+int subsystem_register(struct kset *s);
+void subsystem_unregister(struct kset *s);
+struct kset *subsys_get(struct kset *s);
+void kset_put(struct kset *s);
+
+These are just wrappers around the respective kset_* functions.
2.3 sysfs
@@ -254,114 +305,3 @@ Instances of struct kobj_type are not registered; only referenced by
the kset. A kobj_type may be referenced by an arbitrary number of
ksets, as there may be disparate sets of identical objects.
-
-
-4. subsystems
-
-4.1 Description
-
-A subsystem represents a significant entity of code that maintains an
-arbitrary number of sets of objects of various types. Since the number
-of ksets and the type of objects they contain are variable, a
-generic representation of a subsystem is minimal.
-
-
-struct subsystem {
- struct kset kset;
- struct rw_semaphore rwsem;
-};
-
-int subsystem_register(struct subsystem *);
-void subsystem_unregister(struct subsystem *);
-
-struct subsystem * subsys_get(struct subsystem * s);
-void subsys_put(struct subsystem * s);
-
-
-A subsystem contains an embedded kset so:
-
-- It can be represented in the object hierarchy via the kset's
- embedded kobject.
-
-- It can maintain a default list of objects of one type.
-
-Additional ksets may attach to the subsystem simply by referencing the
-subsystem before they are registered. (This one-way reference means
-that there is no way to determine the ksets that are attached to the
-subsystem.)
-
-All ksets that are attached to a subsystem share the subsystem's R/W
-semaphore.
-
-
-4.2 subsystem Programming Interface.
-
-The subsystem programming interface is simple and does not offer the
-flexibility that the kset and kobject programming interfaces do. They
-may be registered and unregistered, as well as reference counted. Each
-call forwards the calls to their embedded ksets (which forward the
-calls to their embedded kobjects).
-
-
-4.3 Helpers
-
-A number of macros are available to make dealing with subsystems and
-their embedded objects easier.
-
-
-decl_subsys(name,type)
-
-Declares a subsystem named '<name>_subsys', with an embedded kset of
-type <type>. For example,
-
-decl_subsys(devices,&ktype_devices);
-
-is equivalent to doing:
-
-struct subsystem device_subsys = {
- .kset = {
- .kobj = {
- .name = "devices",
- },
- .ktype = &ktype_devices,
- }
-};
-
-
-The objects that are registered with a subsystem that use the
-subsystem's default list must have their kset ptr set properly. These
-objects may have embedded kobjects, ksets, or other subsystems. The
-following helpers make setting the kset easier:
-
-
-kobj_set_kset_s(obj,subsys)
-
-- Assumes that obj->kobj exists, and is a struct kobject.
-- Sets the kset of that kobject to the subsystem's embedded kset.
-
-
-kset_set_kset_s(obj,subsys)
-
-- Assumes that obj->kset exists, and is a struct kset.
-- Sets the kset of the embedded kobject to the subsystem's
- embedded kset.
-
-subsys_set_kset(obj,subsys)
-
-- Assumes obj->subsys exists, and is a struct subsystem.
-- Sets obj->subsys.kset.kobj.kset to the subsystem's embedded kset.
-
-
-4.4 sysfs
-
-subsystems are represented in sysfs via their embedded kobjects. They
-follow the same rules as previously mentioned with no exceptions. They
-typically receive a top-level directory in sysfs, except when their
-embedded kobject is part of another kset, or the parent of the
-embedded kobject is explicitly set.
-
-Note that the subsystem's embedded kset must be 'attached' to the
-subsystem itself in order to use its rwsem. This is done after
-kset_add() has been called. (Not before, because kset_add() uses its
-subsystem for a default parent if it doesn't already have one).
-
diff --git a/Documentation/lguest/Makefile b/Documentation/lguest/Makefile
index 31e794ef5f98..c0b7a4556390 100644
--- a/Documentation/lguest/Makefile
+++ b/Documentation/lguest/Makefile
@@ -13,7 +13,9 @@ LGUEST_GUEST_TOP := ($(CONFIG_PAGE_OFFSET) - 0x08000000)
CFLAGS:=-Wall -Wmissing-declarations -Wmissing-prototypes -O3 -Wl,-T,lguest.lds
LDLIBS:=-lz
-
+# Removing this works for some versions of ld.so (eg. Ubuntu Feisty) and
+# not others (eg. FC7).
+LDFLAGS+=-static
all: lguest.lds lguest
# The linker script on x86 is so complex the only way of creating one
diff --git a/Documentation/lguest/extract b/Documentation/lguest/extract
new file mode 100644
index 000000000000..7730bb6e4b94
--- /dev/null
+++ b/Documentation/lguest/extract
@@ -0,0 +1,58 @@
+#! /bin/sh
+
+set -e
+
+PREFIX=$1
+shift
+
+trap 'rm -r $TMPDIR' 0
+TMPDIR=`mktemp -d`
+
+exec 3>/dev/null
+for f; do
+ while IFS="
+" read -r LINE; do
+ case "$LINE" in
+ *$PREFIX:[0-9]*:\**)
+ NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
+ if [ -f $TMPDIR/$NUM ]; then
+ echo "$TMPDIR/$NUM already exits prior to $f"
+ exit 1
+ fi
+ exec 3>>$TMPDIR/$NUM
+ echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM
+ /bin/echo "$LINE" | sed -e "s/$PREFIX:[0-9]*//" -e "s/:\*/*/" >&3
+ ;;
+ *$PREFIX:[0-9]*)
+ NUM=`echo "$LINE" | sed "s/.*$PREFIX:\([0-9]*\).*/\1/"`
+ if [ -f $TMPDIR/$NUM ]; then
+ echo "$TMPDIR/$NUM already exits prior to $f"
+ exit 1
+ fi
+ exec 3>>$TMPDIR/$NUM
+ echo $f | sed 's,\.\./,,g' > $TMPDIR/.$NUM
+ /bin/echo "$LINE" | sed "s/$PREFIX:[0-9]*//" >&3
+ ;;
+ *:\**)
+ /bin/echo "$LINE" | sed -e "s/:\*/*/" -e "s,/\*\*/,," >&3
+ echo >&3
+ exec 3>/dev/null
+ ;;
+ *)
+ /bin/echo "$LINE" >&3
+ ;;
+ esac
+ done < $f
+ echo >&3
+ exec 3>/dev/null
+done
+
+LASTFILE=""
+for f in $TMPDIR/*; do
+ if [ "$LASTFILE" != $(cat $TMPDIR/.$(basename $f) ) ]; then
+ LASTFILE=$(cat $TMPDIR/.$(basename $f) )
+ echo "[ $LASTFILE ]"
+ fi
+ cat $f
+done
+
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c
index 62a8133393e1..f7918401a007 100644
--- a/Documentation/lguest/lguest.c
+++ b/Documentation/lguest/lguest.c
@@ -1,5 +1,10 @@
-/* Simple program to layout "physical" memory for new lguest guest.
- * Linked high to avoid likely physical memory. */
+/*P:100 This is the Launcher code, a simple program which lays out the
+ * "physical" memory for the new Guest by mapping the kernel image and the
+ * virtual devices, then reads repeatedly from /dev/lguest to run the Guest.
+ *
+ * The only trick: the Makefile links it at a high address so it will be clear
+ * of the guest memory region. It means that each Guest cannot have more than
+ * about 2.5G of memory on a normally configured Host. :*/
#define _LARGEFILE64_SOURCE
#define _GNU_SOURCE
#include <stdio.h>
@@ -29,12 +34,20 @@
#include <termios.h>
#include <getopt.h>
#include <zlib.h>
+/*L:110 We can ignore the 28 include files we need for this program, but I do
+ * want to draw attention to the use of kernel-style types.
+ *
+ * As Linus said, "C is a Spartan language, and so should your naming be." I
+ * like these abbreviations and the header we need uses them, so we define them
+ * here.
+ */
typedef unsigned long long u64;
typedef uint32_t u32;
typedef uint16_t u16;
typedef uint8_t u8;
#include "../../include/linux/lguest_launcher.h"
#include "../../include/asm-i386/e820.h"
+/*:*/
#define PAGE_PRESENT 0x7 /* Present, RW, Execute */
#define NET_PEERNUM 1
@@ -43,33 +56,52 @@ typedef uint8_t u8;
#define SIOCBRADDIF 0x89a2 /* add interface to bridge */
#endif
+/*L:120 verbose is both a global flag and a macro. The C preprocessor allows
+ * this, and although I wouldn't recommend it, it works quite nicely here. */
static bool verbose;
#define verbose(args...) \
do { if (verbose) printf(args); } while(0)
+/*:*/
+
+/* The pipe to send commands to the waker process */
static int waker_fd;
+/* The top of guest physical memory. */
static u32 top;
+/* This is our list of devices. */
struct device_list
{
+ /* Summary information about the devices in our list: ready to pass to
+ * select() to ask which need servicing.*/
fd_set infds;
int max_infd;
+ /* The descriptor page for the devices. */
struct lguest_device_desc *descs;
+
+ /* A single linked list of devices. */
struct device *dev;
+ /* ... And an end pointer so we can easily append new devices */
struct device **lastdev;
};
+/* The device structure describes a single device. */
struct device
{
+ /* The linked-list pointer. */
struct device *next;
+ /* The descriptor for this device, as mapped into the Guest. */
struct lguest_device_desc *desc;
+ /* The memory page(s) of this device, if any. Also mapped in Guest. */
void *mem;
- /* Watch this fd if handle_input non-NULL. */
+ /* If handle_input is set, it wants to be called when this file
+ * descriptor is ready. */
int fd;
bool (*handle_input)(int fd, struct device *me);
- /* Watch DMA to this key if handle_input non-NULL. */
+ /* If handle_output is set, it wants to be called when the Guest sends
+ * DMA to this key. */
unsigned long watch_key;
u32 (*handle_output)(int fd, const struct iovec *iov,
unsigned int num, struct device *me);
@@ -78,6 +110,11 @@ struct device
void *priv;
};
+/*L:130
+ * Loading the Kernel.
+ *
+ * We start with a couple of simple helper routines. open_or_die() avoids
+ * error-checking code cluttering the callers: */
static int open_or_die(const char *name, int flags)
{
int fd = open(name, flags);
@@ -86,26 +123,38 @@ static int open_or_die(const char *name, int flags)
return fd;
}
+/* map_zeroed_pages() takes a (page-aligned) address and a number of pages. */
static void *map_zeroed_pages(unsigned long addr, unsigned int num)
{
+ /* We cache the /dev/zero file-descriptor so we only open it once. */
static int fd = -1;
if (fd == -1)
fd = open_or_die("/dev/zero", O_RDONLY);
+ /* We use a private mapping (ie. if we write to the page, it will be
+ * copied), and obviously we insist that it be mapped where we ask. */
if (mmap((void *)addr, getpagesize() * num,
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, 0)
!= (void *)addr)
err(1, "Mmaping %u pages of /dev/zero @%p", num, (void *)addr);
+
+ /* Returning the address is just a courtesy: can simplify callers. */
return (void *)addr;
}
-/* Find magic string marking entry point, return entry point. */
+/* To find out where to start we look for the magic Guest string, which marks
+ * the code we see in lguest_asm.S. This is a hack which we are currently
+ * plotting to replace with the normal Linux entry point. */
static unsigned long entry_point(void *start, void *end,
unsigned long page_offset)
{
void *p;
+ /* The scan gives us the physical starting address. We want the
+ * virtual address in this case, and fortunately, we already figured
+ * out the physical-virtual difference and passed it here in
+ * "page_offset". */
for (p = start; p < end; p++)
if (memcmp(p, "GenuineLguest", strlen("GenuineLguest")) == 0)
return (long)p + strlen("GenuineLguest") + page_offset;
@@ -113,7 +162,17 @@ static unsigned long entry_point(void *start, void *end,
err(1, "Is this image a genuine lguest?");
}
-/* Returns the entry point */
+/* This routine takes an open vmlinux image, which is in ELF, and maps it into
+ * the Guest memory. ELF = Executable and Linkable Format, which is the format used
+ * by all modern binaries on Linux including the kernel.
+ *
+ * The ELF headers give *two* addresses: a physical address, and a virtual
+ * address. The Guest kernel expects to be placed in memory at the physical
+ * address, and the page tables set up so it will correspond to that virtual
+ * address. We return the difference between the virtual and physical
+ * addresses in the "page_offset" pointer.
+ *
+ * We return the starting address. */
static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
unsigned long *page_offset)
{
@@ -122,40 +181,61 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
unsigned int i;
unsigned long start = -1UL, end = 0;
- /* Sanity checks. */
+ /* Sanity checks on the main ELF header: an x86 executable with a
+ * reasonable number of correctly-sized program headers. */
if (ehdr->e_type != ET_EXEC
|| ehdr->e_machine != EM_386
|| ehdr->e_phentsize != sizeof(Elf32_Phdr)
|| ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr))
errx(1, "Malformed elf header");
+ /* An ELF executable contains an ELF header and a number of "program"
+ * headers which indicate which parts ("segments") of the program to
+ * load where. */
+
+ /* We read in all the program headers at once: */
if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0)
err(1, "Seeking to program headers");
if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr))
err(1, "Reading program headers");
+ /* We don't know page_offset yet. */
*page_offset = 0;
- /* We map the loadable segments at virtual addresses corresponding
- * to their physical addresses (our virtual == guest physical). */
+
+ /* Try all the headers: there are usually only three. A read-only one,
+ * a read-write one, and a "note" section which isn't loadable. */
for (i = 0; i < ehdr->e_phnum; i++) {
+ /* If this isn't a loadable segment, we ignore it */
if (phdr[i].p_type != PT_LOAD)
continue;
verbose("Section %i: size %i addr %p\n",
i, phdr[i].p_memsz, (void *)phdr[i].p_paddr);
- /* We expect linear address space. */
+ /* We expect a simple linear address space: every segment must
+ * have the same difference between virtual (p_vaddr) and
+ * physical (p_paddr) address. */
if (!*page_offset)
*page_offset = phdr[i].p_vaddr - phdr[i].p_paddr;
else if (*page_offset != phdr[i].p_vaddr - phdr[i].p_paddr)
errx(1, "Page offset of section %i different", i);
+ /* We track the first and last address we mapped, so we can
+ * tell entry_point() where to scan. */
if (phdr[i].p_paddr < start)
start = phdr[i].p_paddr;
if (phdr[i].p_paddr + phdr[i].p_filesz > end)
end = phdr[i].p_paddr + phdr[i].p_filesz;
- /* We map everything private, writable. */
+ /* We map this section of the file at its physical address. We
+ * map it read & write even if the header says this segment is
+ * read-only. The kernel really wants to be writable: it
+ * patches its own instructions which would normally be
+ * read-only.
+ *
+ * MAP_PRIVATE means that the page won't be copied until a
+ * write is done to it. This allows us to share much of the
+ * kernel memory between Guests. */
addr = mmap((void *)phdr[i].p_paddr,
phdr[i].p_filesz,
PROT_READ|PROT_WRITE|PROT_EXEC,
@@ -169,7 +249,31 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr,
return entry_point((void *)start, (void *)end, *page_offset);
}
-/* This is amazingly reliable. */
+/*L:170 Prepare to be SHOCKED and AMAZED. And possibly a trifle nauseated.
+ *
+ * We know that CONFIG_PAGE_OFFSET sets what virtual address the kernel expects
+ * to be. We don't know what that option was, but we can figure it out
+ * approximately by looking at the addresses in the code. I chose the common
+ * case of reading a memory location into the %eax register:
+ *
+ * movl <some-address>, %eax
+ *
+ * This gets encoded as five bytes: "0xA1 <4-byte-address>". For example,
+ * "0xA1 0x18 0x60 0x47 0xC0" reads the address 0xC0476018 into %eax.
+ *
+ * In this example we can guess that the kernel was compiled with
+ * CONFIG_PAGE_OFFSET set to 0xC0000000 (it's always a round number). If the
+ * kernel were larger than 16MB, we might see 0xC1 addresses show up, but our
+ * kernel isn't that bloated yet.
+ *
+ * Unfortunately, x86 has variable-length instructions, so finding this
+ * particular instruction properly involves writing a disassembler. Instead,
+ * we rely on statistics. We look for "0xA1" and tally the different bytes
+ * which occur 4 bytes later (the "0xC0" in our example above). When one of
+ * those bytes appears three times, we can be reasonably confident that it
+ * forms the start of CONFIG_PAGE_OFFSET.
+ *
+ * This is amazingly reliable. */
static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
{
unsigned int i, possibilities[256] = { 0 };
@@ -182,30 +286,52 @@ static unsigned long intuit_page_offset(unsigned char *img, unsigned long len)
errx(1, "could not determine page offset");
}
+/*L:160 Unfortunately the entire ELF image isn't compressed: the segments
+ * which need loading are extracted and compressed raw. This denies us the
+ * information we need to make a fully-general loader. */
static unsigned long unpack_bzimage(int fd, unsigned long *page_offset)
{
gzFile f;
int ret, len = 0;
+ /* A bzImage always gets loaded at physical address 1M. This is
+ * actually configurable as CONFIG_PHYSICAL_START, but as the comment
+ * there says, "Don't change this unless you know what you are doing".
+ * Indeed. */
void *img = (void *)0x100000;
+ /* gzdopen takes our file descriptor (carefully placed at the start of
+ * the GZIP header we found) and returns a gzFile. */
f = gzdopen(fd, "rb");
+ /* We read it into memory in 64k chunks until we hit the end. */
while ((ret = gzread(f, img + len, 65536)) > 0)
len += ret;
if (ret < 0)
err(1, "reading image from bzImage");
verbose("Unpacked size %i addr %p\n", len, img);
+
+ /* Without the ELF header, we can't tell the virtual-physical gap. This is
+ * CONFIG_PAGE_OFFSET, and people do actually change it. Fortunately,
+ * I have a clever way of figuring it out from the code itself. */
*page_offset = intuit_page_offset(img, len);
return entry_point(img, img + len, *page_offset);
}
+/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're
+ * supposed to jump into it and it will unpack itself. We can't do that
+ * because the Guest can't run the unpacking code, and adding features to
+ * lguest kills puppies, so we don't want to.
+ *
+ * The bzImage is formed by putting the decompressing code in front of the
+ * compressed kernel code. So we can simply scan through it looking for the
+ * first "gzip" header, and start decompressing from there. */
static unsigned long load_bzimage(int fd, unsigned long *page_offset)
{
unsigned char c;
int state = 0;
- /* Ugly brute force search for gzip header. */
+ /* GZIP header is 0x1F 0x8B <method> <flags>... <compressed-by>. */
while (read(fd, &c, 1) == 1) {
switch (state) {
case 0:
@@ -222,8 +348,10 @@ static unsigned long load_bzimage(int fd, unsigned long *page_offset)
state++;
break;
case 9:
+ /* Seek back to the start of the gzip header. */
lseek(fd, -10, SEEK_CUR);
- if (c != 0x03) /* Compressed under UNIX. */
+ /* One final check: "compressed under UNIX". */
+ if (c != 0x03)
state = -1;
else
return unpack_bzimage(fd, page_offset);
@@ -232,25 +360,43 @@ static unsigned long load_bzimage(int fd, unsigned long *page_offset)
errx(1, "Could not find kernel in bzImage");
}
+/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels
+ * come wrapped up in the self-decompressing "bzImage" format. With some funky
+ * coding, we can load those, too. */
static unsigned long load_kernel(int fd, unsigned long *page_offset)
{
Elf32_Ehdr hdr;
+ /* Read in the first few bytes. */
if (read(fd, &hdr, sizeof(hdr)) != sizeof(hdr))
err(1, "Reading kernel");
+ /* If it's an ELF file, it starts with "\177ELF" */
if (memcmp(hdr.e_ident, ELFMAG, SELFMAG) == 0)
return map_elf(fd, &hdr, page_offset);
+ /* Otherwise we assume it's a bzImage, and try to unpack it */
return load_bzimage(fd, page_offset);
}
+/* This is a trivial little helper to align pages. Andi Kleen hated it because
+ * it calls getpagesize() twice: "it's dumb code."
+ *
+ * Kernel guys get really het up about optimization, even when it's not
+ * necessary. I leave this code as a reaction against that. */
static inline unsigned long page_align(unsigned long addr)
{
+ /* Add upwards and truncate downwards. */
return ((addr + getpagesize()-1) & ~(getpagesize()-1));
}
-/* initrd gets loaded at top of memory: return length. */
+/*L:180 An "initial ram disk" is a disk image loaded into memory along with
+ * the kernel which the kernel can use to boot from without needing any
+ * drivers. Most distributions now use this as standard: the initrd contains
+ * the code to load the appropriate driver modules for the current machine.
+ *
+ * Importantly, James Morris works for RedHat, and Fedora uses initrds for its
+ * kernels. He sent me this (and tells me when I break it). */
static unsigned long load_initrd(const char *name, unsigned long mem)
{
int ifd;
@@ -259,21 +405,35 @@ static unsigned long load_initrd(const char *name, unsigned long mem)
void *iaddr;
ifd = open_or_die(name, O_RDONLY);
+ /* fstat() is needed to get the file size. */
if (fstat(ifd, &st) < 0)
err(1, "fstat() on initrd '%s'", name);
+ /* The length needs to be rounded up to a page size: mmap needs the
+ * address to be page aligned. */
len = page_align(st.st_size);
+ /* We map the initrd at the top of memory. */
iaddr = mmap((void *)mem - len, st.st_size,
PROT_READ|PROT_EXEC|PROT_WRITE,
MAP_FIXED|MAP_PRIVATE, ifd, 0);
if (iaddr != (void *)mem - len)
err(1, "Mmaping initrd '%s' returned %p not %p",
name, iaddr, (void *)mem - len);
+ /* Once a file is mapped, you can close the file descriptor. It's a
+ * little odd, but quite useful. */
close(ifd);
verbose("mapped initrd %s size=%lu @ %p\n", name, st.st_size, iaddr);
+
+ /* We return the initrd size. */
return len;
}
+/* Once we know how much memory we have, and the address the Guest kernel
+ * expects, we can construct simple linear page tables which will get the Guest
+ * far enough into the boot to create its own.
+ *
+ * We lay them out of the way, just below the initrd (which is why we need to
+ * know its size). */
static unsigned long setup_pagetables(unsigned long mem,
unsigned long initrd_size,
unsigned long page_offset)
@@ -282,23 +442,32 @@ static unsigned long setup_pagetables(unsigned long mem,
unsigned int mapped_pages, i, linear_pages;
unsigned int ptes_per_page = getpagesize()/sizeof(u32);
- /* If we can map all of memory above page_offset, we do so. */
+ /* Ideally we map all physical memory starting at page_offset.
+ * However, if page_offset is 0xC0000000 we can only map 1G of physical
+ * (0xC0000000 + 1G overflows). */
if (mem <= -page_offset)
mapped_pages = mem/getpagesize();
else
mapped_pages = -page_offset/getpagesize();
- /* Each linear PTE page can map ptes_per_page pages. */
+ /* Each PTE page can map ptes_per_page pages: how many do we need? */
linear_pages = (mapped_pages + ptes_per_page-1)/ptes_per_page;
- /* We lay out top-level then linear mapping immediately below initrd */
+ /* We put the toplevel page directory page at the top of memory. */
pgdir = (void *)mem - initrd_size - getpagesize();
+
+ /* Now we use the next linear_pages pages as pte pages */
linear = (void *)pgdir - linear_pages*getpagesize();
+ /* Linear mapping is easy: put every page's address into the mapping in
+ * order. PAGE_PRESENT contains the flags Present, Writable and
+ * Executable. */
for (i = 0; i < mapped_pages; i++)
linear[i] = ((i * getpagesize()) | PAGE_PRESENT);
- /* Now set up pgd so that this memory is at page_offset */
+ /* The top level points to the linear page table pages above. The
+ * entry representing page_offset points to the first one, and they
+ * continue from there. */
for (i = 0; i < mapped_pages; i += ptes_per_page) {
pgdir[(i + page_offset/getpagesize())/ptes_per_page]
= (((u32)linear + i*sizeof(u32)) | PAGE_PRESENT);
@@ -307,9 +476,13 @@ static unsigned long setup_pagetables(unsigned long mem,
verbose("Linear mapping of %u pages in %u pte pages at %p\n",
mapped_pages, linear_pages, linear);
+ /* We return the top level (guest-physical) address: the kernel needs
+ * to know where it is. */
return (unsigned long)pgdir;
}
+/* Simple routine to roll all the commandline arguments together with spaces
+ * between them. */
static void concat(char *dst, char *args[])
{
unsigned int i, len = 0;
@@ -323,6 +496,10 @@ static void concat(char *dst, char *args[])
dst[len] = '\0';
}
+/* This is where we actually tell the kernel to initialize the Guest. We saw
+ * the arguments it expects when we looked at initialize() in lguest_user.c:
+ * the top physical page to allow, the top level pagetable, the entry point and
+ * the page_offset constant for the Guest. */
static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
{
u32 args[] = { LHREQ_INITIALIZE,
@@ -332,8 +509,11 @@ static int tell_kernel(u32 pgdir, u32 start, u32 page_offset)
fd = open_or_die("/dev/lguest", O_RDWR);
if (write(fd, args, sizeof(args)) < 0)
err(1, "Writing to /dev/lguest");
+
+ /* We return the /dev/lguest file descriptor to control this Guest */
return fd;
}
+/*:*/
static void set_fd(int fd, struct device_list *devices)
{
@@ -342,61 +522,108 @@ static void set_fd(int fd, struct device_list *devices)
devices->max_infd = fd;
}
-/* When input arrives, we tell the kernel to kick lguest out with -EAGAIN. */
+/*L:200
+ * The Waker.
+ *
+ * With a console and network devices, we can have lots of input which we need
+ * to process. We could try to tell the kernel what file descriptors to watch,
+ * but handing a file descriptor mask through to the kernel is fairly icky.
+ *
+ * Instead, we fork off a process which watches the file descriptors and writes
+ * the LHREQ_BREAK command to the /dev/lguest filedescriptor to tell the Host
+ * loop to stop running the Guest. This causes it to return from the
+ * /dev/lguest read with -EAGAIN, where it will write to /dev/lguest to reset
+ * the LHREQ_BREAK and wake us up again.
+ *
+ * This, of course, is merely a different *kind* of icky.
+ */
static void wake_parent(int pipefd, int lguest_fd, struct device_list *devices)
{
+ /* Add the pipe from the Launcher to the fdset in the device_list, so
+ * we watch it, too. */
set_fd(pipefd, devices);
for (;;) {
fd_set rfds = devices->infds;
u32 args[] = { LHREQ_BREAK, 1 };
+ /* Wait until input is ready from one of the devices. */
select(devices->max_infd+1, &rfds, NULL, NULL, NULL);
+ /* Is it a message from the Launcher? */
if (FD_ISSET(pipefd, &rfds)) {
int ignorefd;
+ /* If read() returns 0, it means the Launcher has
+ * exited. We silently follow. */
if (read(pipefd, &ignorefd, sizeof(ignorefd)) == 0)
exit(0);
+ /* Otherwise it's telling us there's a problem with one
+ * of the devices, and we should ignore that file
+ * descriptor from now on. */
FD_CLR(ignorefd, &devices->infds);
- } else
+ } else /* Send LHREQ_BREAK command. */
write(lguest_fd, args, sizeof(args));
}
}
+/* This routine just sets up a pipe to the Waker process. */
static int setup_waker(int lguest_fd, struct device_list *device_list)
{
int pipefd[2], child;
+ /* We create a pipe to talk to the waker, and also so it knows when the
+ * Launcher dies (and closes pipe). */
pipe(pipefd);
child = fork();
if (child == -1)
err(1, "forking");
if (child == 0) {
+ /* Close the "writing" end of our copy of the pipe */
close(pipefd[1]);
wake_parent(pipefd[0], lguest_fd, device_list);
}
+ /* Close the reading end of our copy of the pipe. */
close(pipefd[0]);
+ /* Here is the fd used to talk to the waker. */
return pipefd[1];
}
+/*L:210
+ * Device Handling.
+ *
+ * When the Guest sends DMA to us, it sends us an array of addresses and sizes.
+ * We need to make sure it's not trying to reach into the Launcher itself, so
+ * we have a convenient routine which checks it and exits with an error message
+ * if something funny is going on:
+ */
static void *_check_pointer(unsigned long addr, unsigned int size,
unsigned int line)
{
+ /* We have to separately check addr and addr+size, because size could
+ * be huge and addr + size might wrap around. */
if (addr >= top || addr + size >= top)
errx(1, "%s:%i: Invalid address %li", __FILE__, line, addr);
+ /* We return a pointer for the caller's convenience, now we know it's
+ * safe to use. */
return (void *)addr;
}
+/* A macro which transparently hands the line number to the real function. */
#define check_pointer(addr,size) _check_pointer(addr, size, __LINE__)
-/* Returns pointer to dma->used_len */
+/* The Guest has given us the address of a "struct lguest_dma". We check it's
+ * OK and convert it to an iovec (which is a simple array of ptr/size
+ * pairs). */
static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
{
unsigned int i;
struct lguest_dma *udma;
+ /* First we make sure that the array memory itself is valid. */
udma = check_pointer(dma, sizeof(*udma));
+ /* Now we check each element */
for (i = 0; i < LGUEST_MAX_DMA_SECTIONS; i++) {
+ /* A zero length ends the array. */
if (!udma->len[i])
break;
@@ -404,9 +631,15 @@ static u32 *dma2iov(unsigned long dma, struct iovec iov[], unsigned *num)
iov[i].iov_len = udma->len[i];
}
*num = i;
+
+ /* We return the pointer to where the caller should write the amount of
+ * the buffer used. */
return &udma->used_len;
}
+/* This routine gets a DMA buffer from the Guest for a given key, and converts
+ * it to an iovec array. It returns the interrupt the Guest wants when we're
+ * finished, and a pointer to the "used_len" field to fill in. */
static u32 *get_dma_buffer(int fd, void *key,
struct iovec iov[], unsigned int *num, u32 *irq)
{
@@ -414,16 +647,21 @@ static u32 *get_dma_buffer(int fd, void *key,
unsigned long udma;
u32 *res;
+ /* Ask the kernel for a DMA buffer corresponding to this key. */
udma = write(fd, buf, sizeof(buf));
+ /* They haven't registered any, or they're all used? */
if (udma == (unsigned long)-1)
return NULL;
- /* Kernel stashes irq in ->used_len. */
+ /* Convert it into our iovec array */
res = dma2iov(udma, iov, num);
+ /* The kernel stashes irq in ->used_len to get it out to us. */
*irq = *res;
+ /* Return a pointer to ((struct lguest_dma *)udma)->used_len. */
return res;
}
+/* This is a convenient routine to send the Guest an interrupt. */
static void trigger_irq(int fd, u32 irq)
{
u32 buf[] = { LHREQ_IRQ, irq };
@@ -431,6 +669,10 @@ static void trigger_irq(int fd, u32 irq)
err(1, "Triggering irq %i", irq);
}
+/* This simply sets up an iovec array where we can put data to be discarded.
+ * This happens when the Guest doesn't want or can't handle the input: we have
+ * to get rid of it somewhere, and if we bury it in the ceiling space it will
+ * start to smell after a week. */
static void discard_iovec(struct iovec *iov, unsigned int *num)
{
static char discard_buf[1024];
@@ -439,19 +681,24 @@ static void discard_iovec(struct iovec *iov, unsigned int *num)
iov->iov_len = sizeof(discard_buf);
}
+/* Here are the input terminal settings we save, and the routine to restore them
+ * on exit so the user can see what they type next. */
static struct termios orig_term;
static void restore_term(void)
{
tcsetattr(STDIN_FILENO, TCSANOW, &orig_term);
}
+/* We associate some data with the console for our exit hack. */
struct console_abort
{
+ /* How many times have they hit ^C? */
int count;
+ /* When did they start? */
struct timeval start;
};
-/* We DMA input to buffer bound at start of console page. */
+/* This is the routine which handles console input (ie. stdin). */
static bool handle_console_input(int fd, struct device *dev)
{
u32 irq = 0, *lenp;
@@ -460,24 +707,38 @@ static bool handle_console_input(int fd, struct device *dev)
struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
struct console_abort *abort = dev->priv;
+ /* First we get the console buffer from the Guest. The key is dev->mem
+ * which was set to 0 in setup_console(). */
lenp = get_dma_buffer(fd, dev->mem, iov, &num, &irq);
if (!lenp) {
+ /* If it's not ready for input, warn and set up to discard. */
warn("console: no dma buffer!");
discard_iovec(iov, &num);
}
+ /* This is why we convert to iovecs: the readv() call uses them, and so
+ * it reads straight into the Guest's buffer. */
len = readv(dev->fd, iov, num);
if (len <= 0) {
+ /* This implies that the console is closed, is /dev/null, or
+ * something went terribly wrong. We still go through the rest
+ * of the logic, though, especially the exit handling below. */
warnx("Failed to get console input, ignoring console.");
len = 0;
}
+ /* If we read the data into the Guest, fill in the length and send the
+ * interrupt. */
if (lenp) {
*lenp = len;
trigger_irq(fd, irq);
}
- /* Three ^C within one second? Exit. */
+ /* Three ^C within one second? Exit.
+ *
+ * This is such a hack, but works surprisingly well. Each ^C has to be
+ * in a buffer by itself, so they can't be too fast. But we check that
+ * we get three within about a second, so they can't be too slow. */
if (len == 1 && ((char *)iov[0].iov_base)[0] == 3) {
if (!abort->count++)
gettimeofday(&abort->start, NULL);
@@ -485,43 +746,60 @@ static bool handle_console_input(int fd, struct device *dev)
struct timeval now;
gettimeofday(&now, NULL);
if (now.tv_sec <= abort->start.tv_sec+1) {
- /* Make sure waker is not blocked in BREAK */
u32 args[] = { LHREQ_BREAK, 0 };
+ /* Close the fd so Waker will know it has to
+ * exit. */
close(waker_fd);
+ /* Just in case waker is blocked in BREAK, send
+ * unbreak now. */
write(fd, args, sizeof(args));
exit(2);
}
abort->count = 0;
}
} else
+ /* Any other key resets the abort counter. */
abort->count = 0;
+ /* Now, if we didn't read anything, put the input terminal back and
+ * return failure (meaning, don't call us again). */
if (!len) {
restore_term();
return false;
}
+ /* Everything went OK! */
return true;
}
+/* Handling console output is much simpler than input. */
static u32 handle_console_output(int fd, const struct iovec *iov,
unsigned num, struct device*dev)
{
+ /* Whatever the Guest sends, write it to standard output. Return the
+ * number of bytes written. */
return writev(STDOUT_FILENO, iov, num);
}
+/* Guest->Host network output is also pretty easy. */
static u32 handle_tun_output(int fd, const struct iovec *iov,
unsigned num, struct device *dev)
{
- /* Now we've seen output, we should warn if we can't get buffers. */
+ /* We put a flag in the "priv" pointer of the network device, and set
+ * it as soon as we see output. We'll see why in handle_tun_input() */
*(bool *)dev->priv = true;
+ /* Whatever packet the Guest sent us, write it out to the tun
+ * device. */
return writev(dev->fd, iov, num);
}
+/* This matches the peer_key() in lguest_net.c. The key for any given slot
+ * is the address of the network device's page plus 4 * the slot number. */
static unsigned long peer_offset(unsigned int peernum)
{
return 4 * peernum;
}
+/* This is where we handle a packet coming in from the tun device */
static bool handle_tun_input(int fd, struct device *dev)
{
u32 irq = 0, *lenp;
@@ -529,17 +807,28 @@ static bool handle_tun_input(int fd, struct device *dev)
unsigned num;
struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
+ /* First we get a buffer the Guest has bound to its key. */
lenp = get_dma_buffer(fd, dev->mem+peer_offset(NET_PEERNUM), iov, &num,
&irq);
if (!lenp) {
+ /* Now, it's expected that if we try to send a packet too
+ * early, the Guest won't be ready yet. This is why we set a
+ * flag when the Guest sends its first packet. If it's sent a
+ * packet we assume it should be ready to receive them.
+ *
+ * Actually, this is what the status bits in the descriptor are
+ * for: we should *use* them. FIXME! */
if (*(bool *)dev->priv)
warn("network: no dma buffer!");
discard_iovec(iov, &num);
}
+ /* Read the packet from the device directly into the Guest's buffer. */
len = readv(dev->fd, iov, num);
if (len <= 0)
err(1, "reading network");
+
+ /* Write the used_len, and trigger the interrupt for the Guest */
if (lenp) {
*lenp = len;
trigger_irq(fd, irq);
@@ -547,9 +836,13 @@ static bool handle_tun_input(int fd, struct device *dev)
verbose("tun input packet len %i [%02x %02x] (%s)\n", len,
((u8 *)iov[0].iov_base)[0], ((u8 *)iov[0].iov_base)[1],
lenp ? "sent" : "discarded");
+ /* All good. */
return true;
}
+/* The last device handling routine is block output: the Guest has sent a DMA
+ * to the block device. It will have placed the command it wants in the
+ * "struct lguest_block_page". */
static u32 handle_block_output(int fd, const struct iovec *iov,
unsigned num, struct device *dev)
{
@@ -559,36 +852,64 @@ static u32 handle_block_output(int fd, const struct iovec *iov,
struct iovec reply[LGUEST_MAX_DMA_SECTIONS];
off64_t device_len, off = (off64_t)p->sector * 512;
+ /* First we extract the device length from the dev->priv pointer. */
device_len = *(off64_t *)dev->priv;
+ /* We first check that the read or write is within the length of the
+ * block file. */
if (off >= device_len)
err(1, "Bad offset %llu vs %llu", off, device_len);
+ /* Move to the right location in the block file. This shouldn't fail,
+ * but best to check. */
if (lseek64(dev->fd, off, SEEK_SET) != off)
err(1, "Bad seek to sector %i", p->sector);
verbose("Block: %s at offset %llu\n", p->type ? "WRITE" : "READ", off);
+ /* They were supposed to bind a reply buffer at key equal to the start
+ * of the block device memory. We need this to tell them when the
+ * request is finished. */
lenp = get_dma_buffer(fd, dev->mem, reply, &reply_num, &irq);
if (!lenp)
err(1, "Block request didn't give us a dma buffer");
if (p->type) {
+ /* A write request. The DMA they sent contained the data, so
+ * write it out. */
len = writev(dev->fd, iov, num);
+ /* Grr... Now we know how long the "struct lguest_dma" they
+ * sent was, we make sure they didn't try to write over the end
+ * of the block file (possibly extending it). */
if (off + len > device_len) {
+ /* Trim it back to the correct length */
ftruncate(dev->fd, device_len);
+ /* Die, bad Guest, die. */
errx(1, "Write past end %llu+%u", off, len);
}
+ /* The reply length is 0: we just send back an empty DMA to
+ * interrupt them and tell them the write is finished. */
*lenp = 0;
} else {
+ /* A read request. They sent an empty DMA to start the
+ * request, and we put the read contents into the reply
+ * buffer. */
len = readv(dev->fd, reply, reply_num);
*lenp = len;
}
+ /* The result is 1 (done), 2 if there was an error (short read or
+ * write). */
p->result = 1 + (p->bytes != len);
+ /* Now tell them we've used their reply buffer. */
trigger_irq(fd, irq);
+
+ /* We're supposed to return the number of bytes of the output buffer we
+ * used. But the block device uses the "result" field instead, so we
+ * don't bother. */
return 0;
}
+/* This is the generic routine we call when the Guest sends some DMA out. */
static void handle_output(int fd, unsigned long dma, unsigned long key,
struct device_list *devices)
{
@@ -597,30 +918,53 @@ static void handle_output(int fd, unsigned long dma, unsigned long key,
struct iovec iov[LGUEST_MAX_DMA_SECTIONS];
unsigned num = 0;
+ /* Convert the "struct lguest_dma" they're sending to a "struct
+ * iovec". */
lenp = dma2iov(dma, iov, &num);
+
+ /* Check each device: if they expect output to this key, tell them to
+ * handle it. */
for (i = devices->dev; i; i = i->next) {
if (i->handle_output && key == i->watch_key) {
+ /* We write the result straight into the used_len field
+ * for them. */
*lenp = i->handle_output(fd, iov, num, i);
return;
}
}
+
+ /* This can happen: the kernel sends any SEND_DMA which doesn't match
+ * another Guest to us. It could be that another Guest just left a
+ * network, for example. But it's unusual. */
warnx("Pending dma %p, key %p", (void *)dma, (void *)key);
}
+/* This is called when the waker wakes us up: check for incoming file
+ * descriptors. */
static void handle_input(int fd, struct device_list *devices)
{
+ /* select() wants a zeroed timeval to mean "don't wait". */
struct timeval poll = { .tv_sec = 0, .tv_usec = 0 };
for (;;) {
struct device *i;
fd_set fds = devices->infds;
+ /* If nothing is ready, we're done. */
if (select(devices->max_infd+1, &fds, NULL, NULL, &poll) == 0)
break;
+ /* Otherwise, call the device(s) which have readable
+ * file descriptors and a method of handling them. */
for (i = devices->dev; i; i = i->next) {
if (i->handle_input && FD_ISSET(i->fd, &fds)) {
+ /* If handle_input() returns false, it means we
+ * should no longer service it.
+ * handle_console_input() does this. */
if (!i->handle_input(fd, i)) {
+ /* Clear it from the set of input file
+ * descriptors kept at the head of the
+ * device list. */
FD_CLR(i->fd, &devices->infds);
/* Tell waker to ignore it too... */
write(waker_fd, &i->fd, sizeof(i->fd));
@@ -630,6 +974,15 @@ static void handle_input(int fd, struct device_list *devices)
}
}
+/*L:190
+ * Device Setup
+ *
+ * All devices need a descriptor so the Guest knows it exists, and a "struct
+ * device" so the Launcher can keep track of it. We have common helper
+ * routines to allocate them.
+ *
+ * This routine allocates a new "struct lguest_device_desc" from descriptor
+ * table in the devices array just above the Guest's normal memory. */
static struct lguest_device_desc *
new_dev_desc(struct lguest_device_desc *descs,
u16 type, u16 features, u16 num_pages)
@@ -641,6 +994,8 @@ new_dev_desc(struct lguest_device_desc *descs,
descs[i].type = type;
descs[i].features = features;
descs[i].num_pages = num_pages;
+ /* If they said the device needs memory, we allocate
+ * that now, bumping up the top of Guest memory. */
if (num_pages) {
map_zeroed_pages(top, num_pages);
descs[i].pfn = top/getpagesize();
@@ -652,6 +1007,9 @@ new_dev_desc(struct lguest_device_desc *descs,
errx(1, "too many devices");
}
+/* This monster routine does all the creation and setup of a new device,
+ * including calling new_dev_desc() to allocate the descriptor and device
+ * memory. */
static struct device *new_device(struct device_list *devices,
u16 type, u16 num_pages, u16 features,
int fd,
@@ -664,12 +1022,18 @@ static struct device *new_device(struct device_list *devices,
{
struct device *dev = malloc(sizeof(*dev));
- /* Append to device list. */
+ /* Append to device list. Prepending to a single-linked list is
+ * easier, but the user expects the devices to be arranged on the bus
+ * in command-line order. The first network device on the command line
+ * is eth0, the first block device /dev/lgba, etc. */
*devices->lastdev = dev;
dev->next = NULL;
devices->lastdev = &dev->next;
+ /* Now we populate the fields one at a time. */
dev->fd = fd;
+ /* If we have an input handler for this file descriptor, then we add it
+ * to the device_list's fdset and maxfd. */
if (handle_input)
set_fd(dev->fd, devices);
dev->desc = new_dev_desc(devices->descs, type, features, num_pages);
@@ -680,27 +1044,37 @@ static struct device *new_device(struct device_list *devices,
return dev;
}
+/* Our first setup routine is the console. It's a fairly simple device, but
+ * UNIX tty handling makes it uglier than it could be. */
static void setup_console(struct device_list *devices)
{
struct device *dev;
+ /* If we can save the initial standard input settings... */
if (tcgetattr(STDIN_FILENO, &orig_term) == 0) {
struct termios term = orig_term;
+ /* Then we turn off echo, line buffering and ^C etc. We want a
+ * raw input stream to the Guest. */
term.c_lflag &= ~(ISIG|ICANON|ECHO);
tcsetattr(STDIN_FILENO, TCSANOW, &term);
+ /* If we exit gracefully, the original settings will be
+ * restored so the user can see what they're typing. */
atexit(restore_term);
}
- /* We don't currently require a page for the console. */
+ /* We don't currently require any memory for the console, so we ask for
+ * 0 pages. */
dev = new_device(devices, LGUEST_DEVICE_T_CONSOLE, 0, 0,
STDIN_FILENO, handle_console_input,
LGUEST_CONSOLE_DMA_KEY, handle_console_output);
+ /* We store the console state in dev->priv, and initialize it. */
dev->priv = malloc(sizeof(struct console_abort));
((struct console_abort *)dev->priv)->count = 0;
verbose("device %p: console\n",
(void *)(dev->desc->pfn * getpagesize()));
}
+/* Setting up a block file is also fairly straightforward. */
static void setup_block_file(const char *filename, struct device_list *devices)
{
int fd;
@@ -708,20 +1082,47 @@ static void setup_block_file(const char *filename, struct device_list *devices)
off64_t *device_len;
struct lguest_block_page *p;
+ /* We open with O_LARGEFILE because otherwise we get stuck at 2G. We
+ * open with O_DIRECT because otherwise our benchmarks go much too
+ * fast. */
fd = open_or_die(filename, O_RDWR|O_LARGEFILE|O_DIRECT);
+
+ /* We want one page, and have no input handler (the block file never
+ * has anything interesting to say to us). Our timing will be quite
+ * random, so it should be a reasonable randomness source. */
dev = new_device(devices, LGUEST_DEVICE_T_BLOCK, 1,
LGUEST_DEVICE_F_RANDOMNESS,
fd, NULL, 0, handle_block_output);
+
+ /* We store the device size in the private area */
device_len = dev->priv = malloc(sizeof(*device_len));
+ /* This is the safe way of establishing the size of our device: it
+ * might be a normal file or an actual block device like /dev/hdb. */
*device_len = lseek64(fd, 0, SEEK_END);
- p = dev->mem;
+ /* The device memory is a "struct lguest_block_page". It's zeroed
+ * already, we just need to put in the device size. Block devices
+ * think in sectors (ie. 512 byte chunks), so we translate here. */
+ p = dev->mem;
p->num_sectors = *device_len/512;
verbose("device %p: block %i sectors\n",
(void *)(dev->desc->pfn * getpagesize()), p->num_sectors);
}
-/* We use fnctl locks to reserve network slots (autocleanup!) */
+/*
+ * Network Devices.
+ *
+ * Setting up network devices is quite a pain, because we have three types.
+ * First, we have the inter-Guest network. This is a file which is mapped into
+ * the address space of the Guests who are on the network. Because it is a
+ * shared mapping, the same page underlies all the devices, and they can send
+ * DMA to each other.
+ *
+ * Remember from our network driver, the Guest is told what slot in the page it
+ * is to use. We use exclusive fcntl locks to reserve a slot. If another
+ * Guest is using a slot, the lock will fail and we try another. Because fcntl
+ * locks are cleaned up automatically when we die, this cleverly means that our
+ * reservation on the slot will vanish if we crash. */
static unsigned int find_slot(int netfd, const char *filename)
{
struct flock fl;
@@ -729,26 +1130,33 @@ static unsigned int find_slot(int netfd, const char *filename)
fl.l_type = F_WRLCK;
fl.l_whence = SEEK_SET;
fl.l_len = 1;
+ /* Try a 1 byte lock in each possible position number */
for (fl.l_start = 0;
fl.l_start < getpagesize()/sizeof(struct lguest_net);
fl.l_start++) {
+ /* If we succeed, return the slot number. */
if (fcntl(netfd, F_SETLK, &fl) == 0)
return fl.l_start;
}
errx(1, "No free slots in network file %s", filename);
}
+/* This function sets up the network file */
static void setup_net_file(const char *filename,
struct device_list *devices)
{
int netfd;
struct device *dev;
+ /* We don't use open_or_die() here: for friendliness we create the file
+ * if it doesn't already exist. */
netfd = open(filename, O_RDWR, 0);
if (netfd < 0) {
if (errno == ENOENT) {
netfd = open(filename, O_RDWR|O_CREAT, 0600);
if (netfd >= 0) {
+ /* If we succeeded, initialize the file with a
+ * blank page. */
char page[getpagesize()];
memset(page, 0, sizeof(page));
write(netfd, page, sizeof(page));
@@ -758,11 +1166,15 @@ static void setup_net_file(const char *filename,
err(1, "cannot open net file '%s'", filename);
}
+ /* We need 1 page, and the features indicate the slot to use and that
+ * no checksum is needed. We never touch this device again; it's
+ * between the Guests on the network, so we don't register input or
+ * output handlers. */
dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
find_slot(netfd, filename)|LGUEST_NET_F_NOCSUM,
-1, NULL, 0, NULL);
- /* We overwrite the /dev/zero mapping with the actual file. */
+ /* Map the shared file. */
if (mmap(dev->mem, getpagesize(), PROT_READ|PROT_WRITE,
MAP_FIXED|MAP_SHARED, netfd, 0) != dev->mem)
err(1, "could not mmap '%s'", filename);
@@ -770,6 +1182,7 @@ static void setup_net_file(const char *filename,
(void *)(dev->desc->pfn * getpagesize()), filename,
dev->desc->features & ~LGUEST_NET_F_NOCSUM);
}
+/*:*/
static u32 str2ip(const char *ipaddr)
{
@@ -779,7 +1192,11 @@ static u32 str2ip(const char *ipaddr)
return (byte[0] << 24) | (byte[1] << 16) | (byte[2] << 8) | byte[3];
}
-/* adapted from libbridge */
+/* This code is "adapted" from libbridge: it attaches the Host end of the
+ * network device to the bridge device specified by the command line.
+ *
+ * This is yet another James Morris contribution (I'm an IP-level guy, so I
+ * dislike bridging), and I just try not to break it. */
static void add_to_bridge(int fd, const char *if_name, const char *br_name)
{
int ifidx;
@@ -798,12 +1215,16 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name)
err(1, "can't add %s to bridge %s", if_name, br_name);
}
+/* This sets up the Host end of the network device with an IP address, brings
+ * it up so packets will flow, then copies the MAC address into the hwaddr
+ * pointer (in practice, the Host's slot in the network device's memory). */
static void configure_device(int fd, const char *devname, u32 ipaddr,
unsigned char hwaddr[6])
{
struct ifreq ifr;
struct sockaddr_in *sin = (struct sockaddr_in *)&ifr.ifr_addr;
+ /* Don't read these incantations. Just cut & paste them like I did! */
memset(&ifr, 0, sizeof(ifr));
strcpy(ifr.ifr_name, devname);
sin->sin_family = AF_INET;
@@ -814,12 +1235,19 @@ static void configure_device(int fd, const char *devname, u32 ipaddr,
if (ioctl(fd, SIOCSIFFLAGS, &ifr) != 0)
err(1, "Bringing interface %s up", devname);
+ /* SIOC stands for Socket I/O Control. G means Get (vs S for Set
+ * above). IF means Interface, and HWADDR is hardware address.
+ * Simple! */
if (ioctl(fd, SIOCGIFHWADDR, &ifr) != 0)
err(1, "getting hw address for %s", devname);
-
memcpy(hwaddr, ifr.ifr_hwaddr.sa_data, 6);
}
+/*L:195 The other kind of network is a Host<->Guest network. This can either
+ * use bridging or routing, but the principle is the same: it uses the "tun"
+ * device to inject packets into the Host as if they came in from a normal
+ * network card. We just shunt packets between the Guest and the tun
+ * device. */
static void setup_tun_net(const char *arg, struct device_list *devices)
{
struct device *dev;
@@ -828,36 +1256,56 @@ static void setup_tun_net(const char *arg, struct device_list *devices)
u32 ip;
const char *br_name = NULL;
+ /* We open the /dev/net/tun device and tell it we want a tap device. A
+ * tap device is like a tun device, only somehow different. To tell
+ * the truth, I completely blundered my way through this code, but it
+ * works now! */
netfd = open_or_die("/dev/net/tun", O_RDWR);
memset(&ifr, 0, sizeof(ifr));
ifr.ifr_flags = IFF_TAP | IFF_NO_PI;
strcpy(ifr.ifr_name, "tap%d");
if (ioctl(netfd, TUNSETIFF, &ifr) != 0)
err(1, "configuring /dev/net/tun");
+ /* We don't need checksums calculated for packets coming in this
+ * device: trust us! */
ioctl(netfd, TUNSETNOCSUM, 1);
- /* You will be peer 1: we should create enough jitter to randomize */
+ /* We create the net device with 1 page, using the features field of
+ * the descriptor to tell the Guest it is in slot 1 (NET_PEERNUM), and
+ * that the device has fairly random timing. We do *not* specify
+ * LGUEST_NET_F_NOCSUM: these packets can reach the real world.
+ *
+ * We will put our MAC address in slot 0 for the Guest to see, so
+ * it will send packets to us using the key "peer_offset(0)": */
dev = new_device(devices, LGUEST_DEVICE_T_NET, 1,
NET_PEERNUM|LGUEST_DEVICE_F_RANDOMNESS, netfd,
handle_tun_input, peer_offset(0), handle_tun_output);
+
+ /* We keep a flag which says whether we've seen packets come out from
+ * this network device. */
dev->priv = malloc(sizeof(bool));
*(bool *)dev->priv = false;
+ /* We need a socket to perform the magic network ioctls to bring up the
+ * tap interface, connect to the bridge etc. Any socket will do! */
ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
if (ipfd < 0)
err(1, "opening IP socket");
+ /* If the command line was --tunnet=bridge:<name> do bridging. */
if (!strncmp(BRIDGE_PFX, arg, strlen(BRIDGE_PFX))) {
ip = INADDR_ANY;
br_name = arg + strlen(BRIDGE_PFX);
add_to_bridge(ipfd, ifr.ifr_name, br_name);
- } else
+ } else /* It is an IP address to set up the device with */
ip = str2ip(arg);
- /* We are peer 0, ie. first slot. */
+ /* We are peer 0, ie. first slot, so we hand dev->mem to this routine
+ * to write the MAC address at the start of the device memory. */
configure_device(ipfd, ifr.ifr_name, ip, dev->mem);
- /* Set "promisc" bit: we want every single packet. */
+ /* Set "promisc" bit: we want every single packet if we're going to
+ * bridge to other machines (and otherwise it doesn't matter). */
*((u8 *)dev->mem) |= 0x1;
close(ipfd);
@@ -868,7 +1316,10 @@ static void setup_tun_net(const char *arg, struct device_list *devices)
if (br_name)
verbose("attached to bridge: %s\n", br_name);
}
+/* That's the end of device setup. */
+/*L:220 Finally we reach the core of the Launcher, which runs the Guest, serves
+ * its input and output, and finally, lays it to rest. */
static void __attribute__((noreturn))
run_guest(int lguest_fd, struct device_list *device_list)
{
@@ -880,20 +1331,37 @@ run_guest(int lguest_fd, struct device_list *device_list)
/* We read from the /dev/lguest device to run the Guest. */
readval = read(lguest_fd, arr, sizeof(arr));
+ /* The read can only really return sizeof(arr) (the Guest did a
+ * SEND_DMA to us), or an error. */
+
+ /* For a successful read, arr[0] is the address of the "struct
+ * lguest_dma", and arr[1] is the key the Guest sent to. */
if (readval == sizeof(arr)) {
handle_output(lguest_fd, arr[0], arr[1], device_list);
continue;
+ /* ENOENT means the Guest died. Reading tells us why. */
} else if (errno == ENOENT) {
char reason[1024] = { 0 };
read(lguest_fd, reason, sizeof(reason)-1);
errx(1, "%s", reason);
+ /* EAGAIN means the waker wanted us to look at some input.
+ * Anything else means a bug or incompatible change. */
} else if (errno != EAGAIN)
err(1, "Running guest failed");
+
+ /* Service input, then unset the BREAK which releases
+ * the Waker. */
handle_input(lguest_fd, device_list);
if (write(lguest_fd, args, sizeof(args)) < 0)
err(1, "Resetting break");
}
}
+/*
+ * This is the end of the Launcher.
+ *
+ * But wait! We've seen I/O from the Launcher, and we've seen I/O from the
+ * Drivers. If we were to see the Host kernel I/O code, our understanding
+ * would be complete... :*/
static struct option opts[] = {
{ "verbose", 0, NULL, 'v' },
@@ -911,20 +1379,49 @@ static void usage(void)
"<mem-in-mb> vmlinux [args...]");
}
+/*L:100 The Launcher code itself takes us out into userspace, that scary place
+ * where pointers run wild and free! Unfortunately, like most userspace
+ * programs, it's quite boring (which is why everyone likes to hack on the
+ * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it
+ * will get you through this section. Or, maybe not.
+ *
+ * The Launcher binary sits up high, usually starting at address 0xB8000000.
+ * Everything below this is the "physical" memory for the Guest. For example,
+ * if the Guest were to write a "1" at physical address 0, we would see a "1"
+ * in the Launcher at "(int *)0". Guest physical == Launcher virtual.
+ *
+ * This can be tough to get your head around, but usually it just means that we
+ * don't need to do any conversion when the Guest gives us its "physical"
+ * addresses.
+ */
int main(int argc, char *argv[])
{
+ /* Memory, top-level pagetable, code startpoint, PAGE_OFFSET and size
+ * of the (optional) initrd. */
unsigned long mem = 0, pgdir, start, page_offset, initrd_size = 0;
+ /* A temporary and the /dev/lguest file descriptor. */
int i, c, lguest_fd;
+ /* The list of Guest devices, based on command line arguments. */
struct device_list device_list;
+ /* The boot information for the Guest: at guest-physical address 0. */
void *boot = (void *)0;
+ /* If they specify an initrd file to load. */
const char *initrd_name = NULL;
+ /* First we initialize the device list. Since console and network
+ * device receive input from a file descriptor, we keep an fdset
+ * (infds) and the maximum fd number (max_infd) with the head of the
+ * list. We also keep a pointer to the last device, for easy appending
+ * to the list. */
device_list.max_infd = -1;
device_list.dev = NULL;
device_list.lastdev = &device_list.dev;
FD_ZERO(&device_list.infds);
- /* We need to know how much memory so we can allocate devices. */
+ /* We need to know how much memory so we can set up the device
+ * descriptor and memory pages for the devices as we parse the command
+ * line. So we quickly look through the arguments to find the amount
+ * of memory now. */
for (i = 1; i < argc; i++) {
if (argv[i][0] != '-') {
mem = top = atoi(argv[i]) * 1024 * 1024;
@@ -933,6 +1430,8 @@ int main(int argc, char *argv[])
break;
}
}
+
+ /* The options are fairly straight-forward */
while ((c = getopt_long(argc, argv, "v", opts, NULL)) != EOF) {
switch (c) {
case 'v':
@@ -955,42 +1454,71 @@ int main(int argc, char *argv[])
usage();
}
}
+ /* After the other arguments we expect memory and kernel image name,
+ * followed by command line arguments for the kernel. */
if (optind + 2 > argc)
usage();
- /* We need a console device */
+ /* We always have a console device */
setup_console(&device_list);
- /* First we map /dev/zero over all of guest-physical memory. */
+ /* We start by mapping anonymous pages over all of guest-physical
+ * memory range. This fills it with 0, and ensures that the Guest
+ * won't be killed when it tries to access it. */
map_zeroed_pages(0, mem / getpagesize());
/* Now we load the kernel */
start = load_kernel(open_or_die(argv[optind+1], O_RDONLY),
&page_offset);
- /* Map the initrd image if requested */
+ /* Map the initrd image if requested (at top of physical memory) */
if (initrd_name) {
initrd_size = load_initrd(initrd_name, mem);
+ /* These are the location in the Linux boot header where the
+ * start and size of the initrd are expected to be found. */
*(unsigned long *)(boot+0x218) = mem - initrd_size;
*(unsigned long *)(boot+0x21c) = initrd_size;
+ /* The bootloader type 0xFF means "unknown"; that's OK. */
*(unsigned char *)(boot+0x210) = 0xFF;
}
- /* Set up the initial linar pagetables. */
+ /* Set up the initial linear pagetables, starting below the initrd. */
pgdir = setup_pagetables(mem, initrd_size, page_offset);
- /* E820 memory map: ours is a simple, single region. */
+ /* The Linux boot header contains an "E820" memory map: ours is a
+ * simple, single region. */
*(char*)(boot+E820NR) = 1;
*((struct e820entry *)(boot+E820MAP))
= ((struct e820entry) { 0, mem, E820_RAM });
- /* Command line pointer and command line (at 4096) */
+ /* The boot header contains a command line pointer: we put the command
+ * line after the boot header (at address 4096) */
*(void **)(boot + 0x228) = boot + 4096;
concat(boot + 4096, argv+optind+2);
- /* Paravirt type: 1 == lguest */
+
+ /* The guest type value of "1" tells the Guest it's under lguest. */
*(int *)(boot + 0x23c) = 1;
+ /* We tell the kernel to initialize the Guest: this returns the open
+ * /dev/lguest file descriptor. */
lguest_fd = tell_kernel(pgdir, start, page_offset);
+
+ /* We fork off a child process, which wakes the Launcher whenever one
+ * of the input file descriptors needs attention. Otherwise we would
+ * run the Guest until it tries to output something. */
waker_fd = setup_waker(lguest_fd, &device_list);
+ /* Finally, run the Guest. This doesn't return. */
run_guest(lguest_fd, &device_list);
}
+/*:*/
+
+/*M:999
+ * Mastery is done: you now know everything I do.
+ *
+ * But surely you have seen code, features and bugs in your wanderings which
+ * you now yearn to attack? That is the real game, and I look forward to you
+ * patching and forking lguest into the Your-Name-Here-visor.
+ *
+ * Farewell, and good coding!
+ * Rusty Russell.
+ */
diff --git a/Documentation/memory-hotplug.txt b/Documentation/memory-hotplug.txt
new file mode 100644
index 000000000000..5fbcc22c98e9
--- /dev/null
+++ b/Documentation/memory-hotplug.txt
@@ -0,0 +1,322 @@
+==============
+Memory Hotplug
+==============
+
+Last Updated: Jul 28 2007
+
+This document is about memory hotplug including how-to-use and current status.
+Because Memory Hotplug is still under development, contents of this text will
+be changed often.
+
+1. Introduction
+ 1.1 purpose of memory hotplug
+ 1.2. Phases of memory hotplug
+ 1.3. Unit of Memory online/offline operation
+2. Kernel Configuration
+3. sysfs files for memory hotplug
+4. Physical memory hot-add phase
+ 4.1 Hardware(Firmware) Support
+ 4.2 Notify memory hot-add event by hand
+5. Logical Memory hot-add phase
+ 5.1. State of memory
+ 5.2. How to online memory
+6. Logical memory remove
+ 6.1 Memory offline and ZONE_MOVABLE
+ 6.2. How to offline memory
+7. Physical memory remove
+8. Future Work List
+
+Note(1): x86_64 has a special implementation for memory hotplug.
+ This text does not describe it.
+Note(2): This text assumes that sysfs is mounted at /sys.
+
+
+---------------
+1. Introduction
+---------------
+
+1.1 purpose of memory hotplug
+------------
+Memory Hotplug allows users to increase/decrease the amount of memory.
+Generally, there are two purposes.
+
+(A) For changing the amount of memory.
+ This is to allow a feature like capacity on demand.
+(B) For installing/removing DIMMs or NUMA-nodes physically.
+ This is to exchange DIMMs/NUMA-nodes, reduce power consumption, etc.
+
+(A) is required by highly virtualized environments and (B) is required by
+hardware which supports memory power management.
+
+Linux memory hotplug is designed for both purposes.
+
+
+1.2. Phases of memory hotplug
+---------------
+There are 2 phases in Memory Hotplug.
+ 1) Physical Memory Hotplug phase
+ 2) Logical Memory Hotplug phase.
+
+The First phase is to communicate with hardware/firmware and make/erase
+environment for hotplugged memory. Basically, this phase is necessary
+for the purpose (B), but this is good phase for communication between
+highly virtualized environments too.
+
+When memory is hotplugged, the kernel recognizes new memory, makes new memory
+management tables, and makes sysfs files for new memory's operation.
+
+If firmware supports notification of connection of new memory to OS,
+this phase is triggered automatically. ACPI can notify this event. If not,
+"probe" operation by system administration is used instead.
+(see Section 4.).
+
+Logical Memory Hotplug phase is to change memory state into
+available/unavailable for users. Amount of memory from user's view is
+changed by this phase. The kernel makes all memory in it as free pages
+when a memory range is available.
+
+In this document, this phase is described as online/offline.
+
+Logical Memory Hotplug phase is triggered by write of sysfs file by system
+administrator. For the hot-add case, it must be executed after Physical Hotplug
+phase by hand.
+(However, if you write udev's hotplug scripts for memory hotplug, these
+ phases can be executed in a seamless way.)
+
+
+1.3. Unit of Memory online/offline operation
+------------
+Memory hotplug uses SPARSEMEM memory model. SPARSEMEM divides the whole memory
+into chunks of the same size. The chunk is called a "section". The size of
+a section is architecture dependent. For example, power uses 16MiB, ia64 uses
+1GiB. The unit of online/offline operation is "one section". (see Section 3.)
+
+To determine the size of sections, please read this file:
+
+/sys/devices/system/memory/block_size_bytes
+
+This file shows the size of sections in bytes.
+
+-----------------------
+2. Kernel Configuration
+-----------------------
+To use memory hotplug feature, kernel must be compiled with following
+config options.
+
+- For all memory hotplug
+ Memory model -> Sparse Memory (CONFIG_SPARSEMEM)
+ Allow for memory hot-add (CONFIG_MEMORY_HOTPLUG)
+
+- To enable memory removal, the following are also necessary
+ Allow for memory hot remove (CONFIG_MEMORY_HOTREMOVE)
+ Page Migration (CONFIG_MIGRATION)
+
+- For ACPI memory hotplug, the following are also necessary
+ Memory hotplug (under ACPI Support menu) (CONFIG_ACPI_HOTPLUG_MEMORY)
+ This option can be kernel module.
+
+- As a related configuration, if your box has a feature of NUMA-node hotplug
+ via ACPI, then this option is necessary too.
+ ACPI0004,PNP0A05 and PNP0A06 Container Driver (under ACPI Support menu)
+ (CONFIG_ACPI_CONTAINER).
+ This option can be kernel module too.
+
+--------------------------------
+3 sysfs files for memory hotplug
+--------------------------------
+All sections have their device information under /sys/devices/system/memory as
+
+/sys/devices/system/memory/memoryXXX
+(XXX is section id.)
+
+Now, XXX is defined as start_address_of_section / section_size.
+
+For example, assume 1GiB section size. A device for a memory starting at
+0x100000000 is /sys/devices/system/memory/memory4
+(0x100000000 / 1GiB = 4)
+This device covers address range [0x100000000 ... 0x140000000)
+
+Under each section, you can see 3 files.
+
+/sys/devices/system/memory/memoryXXX/phys_index
+/sys/devices/system/memory/memoryXXX/phys_device
+/sys/devices/system/memory/memoryXXX/state
+
+'phys_index' : read-only and contains section id, same as XXX.
+'state' : read-write
+ at read: contains online/offline state of memory.
+ at write: user can specify "online", "offline" command
+'phys_device': read-only: designed to show the name of physical memory device.
+ This is not well implemented now.
+
+NOTE:
+ These directories/files appear after physical memory hotplug phase.
+
+
+--------------------------------
+4. Physical memory hot-add phase
+--------------------------------
+
+4.1 Hardware(Firmware) Support
+------------
+On x86_64/ia64 platform, memory hotplug by ACPI is supported.
+
+In general, the firmware (ACPI) which supports memory hotplug defines
+memory class object of _HID "PNP0C80". When a notify is asserted to PNP0C80,
+Linux's ACPI handler does hot-add memory to the system and calls a hotplug udev
+script. This will be done automatically.
+
+But scripts for memory hotplug are not contained in generic udev package(now).
+You may have to write it by yourself or online/offline memory by hand.
+Please see "How to online memory", "How to offline memory" in this text.
+
+If firmware supports NUMA-node hotplug, and defines an object _HID "ACPI0004",
+"PNP0A05", or "PNP0A06", notification is asserted to it, and ACPI handler
+calls hotplug code for all of objects which are defined in it.
+If memory device is found, memory hotplug code will be called.
+
+
+4.2 Notify memory hot-add event by hand
+------------
+In some environments, especially virtualized environment, firmware will not
+notify memory hotplug event to the kernel. For such environment, "probe"
+interface is supported. This interface depends on CONFIG_ARCH_MEMORY_PROBE.
+
+Now, CONFIG_ARCH_MEMORY_PROBE is supported only by powerpc but it does not
+contain highly architecture-specific code. Please add config if you need "probe"
+interface.
+
+Probe interface is located at
+/sys/devices/system/memory/probe
+
+You can tell the physical address of new memory to the kernel by
+
+% echo start_address_of_new_memory > /sys/devices/system/memory/probe
+
+Then, [start_address_of_new_memory, start_address_of_new_memory + section_size)
+memory range is hot-added. In this case, hotplug script is not called (in
+current implementation). You'll have to online memory by yourself.
+Please see "How to online memory" in this text.
+
+
+
+------------------------------
+5. Logical Memory hot-add phase
+------------------------------
+
+5.1. State of memory
+------------
+To see (online/offline) state of memory section, read 'state' file.
+
+% cat /sys/devices/system/memory/memoryXXX/state
+
+
+If the memory section is online, you'll read "online".
+If the memory section is offline, you'll read "offline".
+
+
+5.2. How to online memory
+------------
+Even if the memory is hot-added, it is not at ready-to-use state.
+For using newly added memory, you have to "online" the memory section.
+
+For onlining, you have to write "online" to the section's state file as:
+
+% echo online > /sys/devices/system/memory/memoryXXX/state
+
+After this, section memoryXXX's state will be 'online' and the amount of
+available memory will be increased.
+
+Currently, newly added memory is added as ZONE_NORMAL (for powerpc, ZONE_DMA).
+This may be changed in future.
+
+
+
+------------------------
+6. Logical memory remove
+------------------------
+
+6.1 Memory offline and ZONE_MOVABLE
+------------
+Memory offlining is more complicated than memory online. Because memory offline
+has to make the whole memory section be unused, memory offline can fail if
+the section includes memory which cannot be freed.
+
+In general, memory offline can use 2 techniques.
+
+(1) reclaim and free all memory in the section.
+(2) migrate all pages in the section.
+
+In the current implementation, Linux's memory offline uses method (2), freeing
+all pages in the section by page migration. But not all pages are
+migratable. Under current Linux, migratable pages are anonymous pages and
+page caches. For offlining a section by migration, the kernel has to guarantee
+that the section contains only migratable pages.
+
+Now, a boot option for making a section which consists of migratable pages is
+supported. By specifying "kernelcore=" or "movablecore=" boot option, you can
+create ZONE_MOVABLE...a zone which is just used for movable pages.
+(See also Documentation/kernel-parameters.txt)
+
+Assume the system has "TOTAL" amount of memory at boot time, this boot option
+creates ZONE_MOVABLE as following.
+
+1) When kernelcore=YYYY boot option is used,
+ Size of memory not for movable pages (not for offline) is YYYY.
+ Size of memory for movable pages (for offline) is TOTAL-YYYY.
+
+2) When movablecore=ZZZZ boot option is used,
+ Size of memory not for movable pages (not for offline) is TOTAL - ZZZZ.
+ Size of memory for movable pages (for offline) is ZZZZ.
+
+
+Note) Unfortunately, there is no information to show which section belongs
+to ZONE_MOVABLE. This is TBD.
+
+
+6.2. How to offline memory
+------------
+You can offline a section by using the same sysfs interface that was used in
+memory onlining.
+
+% echo offline > /sys/devices/system/memory/memoryXXX/state
+
+If offline succeeds, the state of the memory section is changed to be "offline".
+If it fails, some error code (like -EBUSY) will be returned by the kernel.
+Even if a section does not belong to ZONE_MOVABLE, you can try to offline it.
+If it doesn't contain 'unmovable' memory, you'll get success.
+
+A section under ZONE_MOVABLE is considered to be able to be offlined easily.
+But under some busy state, it may return -EBUSY. Even if a memory section
+cannot be offlined due to -EBUSY, you can retry offlining it and may be able to
+offline it (or not).
+(For example, a page is referred to by some kernel internal call and released
+ soon.)
+
+Consideration:
+Memory hotplug's design direction is to make the possibility of memory offlining
+higher and to guarantee unplugging memory under any situation. But it needs
+more work. Returning -EBUSY under some situation may be good because the user
+can decide to retry more or not by himself. Currently, memory offlining code
+does some amount of retry with 120 seconds timeout.
+
+-------------------------
+7. Physical memory remove
+-------------------------
+Need more implementation yet....
+ - Notification completion of remove works by OS to firmware.
+ - Guard from remove if not yet.
+
+--------------
+8. Future Work
+--------------
+ - allowing memory hot-add to ZONE_MOVABLE. maybe we need some switch like
+ sysctl or new control file.
+ - showing memory section and physical device relationship.
+ - showing memory section and node relationship (maybe good for NUMA)
+ - showing memory section is under ZONE_MOVABLE or not
+ - test memory offlining and make it better.
+ - support HugeTLB page migration and offlining.
+ - memmap removing at memory offline.
+ - physical remove memory.
+
diff --git a/Documentation/sched-design-CFS.txt b/Documentation/sched-design-CFS.txt
index 16feebb7bdc0..84901e7c0508 100644
--- a/Documentation/sched-design-CFS.txt
+++ b/Documentation/sched-design-CFS.txt
@@ -83,7 +83,7 @@ Some implementation details:
CFS uses nanosecond granularity accounting and does not rely on any
jiffies or other HZ detail. Thus the CFS scheduler has no notion of
'timeslices' and has no heuristics whatsoever. There is only one
- central tunable:
+ central tunable (you have to switch on CONFIG_SCHED_DEBUG):
/proc/sys/kernel/sched_granularity_ns
diff --git a/Documentation/sched-nice-design.txt b/Documentation/sched-nice-design.txt
new file mode 100644
index 000000000000..e2bae5a577e3
--- /dev/null
+++ b/Documentation/sched-nice-design.txt
@@ -0,0 +1,108 @@
+This document explains the thinking about the revamped and streamlined
+nice-levels implementation in the new Linux scheduler.
+
+Nice levels were always pretty weak under Linux and people continuously
+pestered us to make nice +19 tasks use up much less CPU time.
+
+Unfortunately that was not that easy to implement under the old
+scheduler, (otherwise we'd have done it long ago) because nice level
+support was historically coupled to timeslice length, and timeslice
+units were driven by the HZ tick, so the smallest timeslice was 1/HZ.
+
+In the O(1) scheduler (in 2003) we changed negative nice levels to be
+much stronger than they were before in 2.4 (and people were happy about
+that change), and we also intentionally calibrated the linear timeslice
+rule so that nice +19 level would be _exactly_ 1 jiffy. To better
+understand it, the timeslice graph went like this (cheesy ASCII art
+alert!):
+
+
+ A
+ \ | [timeslice length]
+ \ |
+ \ |
+ \ |
+ \ |
+ \|___100msecs
+ |^ . _
+ | ^ . _
+ | ^ . _
+ -*----------------------------------*-----> [nice level]
+ -20 | +19
+ |
+ |
+
+So that if someone wanted to really renice tasks, +19 would give a much
+bigger hit than the normal linear rule would do. (The solution of
+changing the ABI to extend priorities was discarded early on.)
+
+This approach worked to some degree for some time, but later on with
+HZ=1000 it caused 1 jiffy to be 1 msec, which meant 0.1% CPU usage which
+we felt to be a bit excessive. Excessive _not_ because it's too small of
+a CPU utilization, but because it causes too frequent (once per
+millisec) rescheduling. (and would thus trash the cache, etc. Remember,
+this was long ago when hardware was weaker and caches were smaller, and
+people were running number crunching apps at nice +19.)
+
+So for HZ=1000 we changed nice +19 to 5msecs, because that felt like the
+right minimal granularity - and this translates to 5% CPU utilization.
+But the fundamental HZ-sensitive property for nice+19 still remained,
+and we never got a single complaint about nice +19 being too _weak_ in
+terms of CPU utilization, we only got complaints about it (still) being
+too _strong_ :-)
+
+To sum it up: we always wanted to make nice levels more consistent, but
+within the constraints of HZ and jiffies and their nasty design level
+coupling to timeslices and granularity it was not really viable.
+
+The second (less frequent but still periodically occurring) complaint
+about Linux's nice level support was its asymmetry around the origo
+(which you can see demonstrated in the picture above), or more
+accurately: the fact that nice level behavior depended on the _absolute_
+nice level as well, while the nice API itself is fundamentally
+"relative":
+
+ int nice(int inc);
+
+ asmlinkage long sys_nice(int increment)
+
+(the first one is the glibc API, the second one is the syscall API.)
+Note that the 'inc' is relative to the current nice level. Tools like
+bash's "nice" command mirror this relative API.
+
+With the old scheduler, if you for example started a niced task with +1
+and another task with +2, the CPU split between the two tasks would
+depend on the nice level of the parent shell - if it was at nice -10 the
+CPU split was different than if it was at +5 or +10.
+
+A third complaint against Linux's nice level support was that negative
+nice levels were not 'punchy enough', so lots of people had to resort to
+run audio (and other multimedia) apps under RT priorities such as
+SCHED_FIFO. But this caused other problems: SCHED_FIFO is not starvation
+proof, and a buggy SCHED_FIFO app can also lock up the system for good.
+
+The new scheduler in v2.6.23 addresses all three types of complaints:
+
+To address the first complaint (of nice levels being not "punchy"
+enough), the scheduler was decoupled from 'time slice' and HZ concepts
+(and granularity was made a separate concept from nice levels) and thus
+it was possible to implement better and more consistent nice +19
+support: with the new scheduler nice +19 tasks get a HZ-independent
+1.5%, instead of the variable 3%-5%-9% range they got in the old
+scheduler.
+
+To address the second complaint (of nice levels not being consistent),
+the new scheduler makes nice(1) have the same CPU utilization effect on
+tasks, regardless of their absolute nice levels. So on the new
+scheduler, running a nice +10 and a nice +11 task has the same CPU
+utilization "split" between them as running a nice -5 and a nice -4
+task. (one will get 55% of the CPU, the other 45%.) That is why nice
+levels were changed to be "multiplicative" (or exponential) - that way
+it does not matter which nice level you start out from, the 'relative
+result' will always be the same.
+
+The third complaint (of negative nice levels not being "punchy" enough
+and forcing audio apps to run under the more dangerous SCHED_FIFO
+scheduling policy) is addressed by the new scheduler almost
+automatically: stronger negative nice levels are an automatic
+side-effect of the recalibrated dynamic range of nice levels.
diff --git a/Documentation/sched-stats.txt b/Documentation/sched-stats.txt
index 6f72021aae51..442e14d35dea 100644
--- a/Documentation/sched-stats.txt
+++ b/Documentation/sched-stats.txt
@@ -1,10 +1,11 @@
-Version 10 of schedstats includes support for sched_domains, which
-hit the mainline kernel in 2.6.7. Some counters make more sense to be
-per-runqueue; other to be per-domain. Note that domains (and their associated
-information) will only be pertinent and available on machines utilizing
-CONFIG_SMP.
-
-In version 10 of schedstat, there is at least one level of domain
+Version 14 of schedstats includes support for sched_domains, which hit the
+mainline kernel in 2.6.20 although it is identical to the stats from version
+12 which was in the kernel from 2.6.13-2.6.19 (version 13 never saw a kernel
+release). Some counters make more sense to be per-runqueue; others to be
+per-domain. Note that domains (and their associated information) will only
+be pertinent and available on machines utilizing CONFIG_SMP.
+
+In version 14 of schedstat, there is at least one level of domain
statistics for each cpu listed, and there may well be more than one
domain. Domains have no particular names in this implementation, but
the highest numbered one typically arbitrates balancing across all the
@@ -27,7 +28,7 @@ to write their own scripts, the fields are described here.
CPU statistics
--------------
-cpu<N> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28
+cpu<N> 1 2 3 4 5 6 7 8 9 10 11 12
NOTE: In the sched_yield() statistics, the active queue is considered empty
if it has only one process in it, since obviously the process calling
@@ -39,48 +40,20 @@ First four fields are sched_yield() statistics:
3) # of times just the expired queue was empty
4) # of times sched_yield() was called
-Next four are schedule() statistics:
- 5) # of times the active queue had at least one other process on it
- 6) # of times we switched to the expired queue and reused it
- 7) # of times schedule() was called
- 8) # of times schedule() left the processor idle
-
-Next four are active_load_balance() statistics:
- 9) # of times active_load_balance() was called
- 10) # of times active_load_balance() caused this cpu to gain a task
- 11) # of times active_load_balance() caused this cpu to lose a task
- 12) # of times active_load_balance() tried to move a task and failed
-
-Next three are try_to_wake_up() statistics:
- 13) # of times try_to_wake_up() was called
- 14) # of times try_to_wake_up() successfully moved the awakening task
- 15) # of times try_to_wake_up() attempted to move the awakening task
-
-Next two are wake_up_new_task() statistics:
- 16) # of times wake_up_new_task() was called
- 17) # of times wake_up_new_task() successfully moved the new task
-
-Next one is a sched_migrate_task() statistic:
- 18) # of times sched_migrate_task() was called
+Next three are schedule() statistics:
+ 5) # of times we switched to the expired queue and reused it
+ 6) # of times schedule() was called
+ 7) # of times schedule() left the processor idle
-Next one is a sched_balance_exec() statistic:
- 19) # of times sched_balance_exec() was called
+Next two are try_to_wake_up() statistics:
+ 8) # of times try_to_wake_up() was called
+ 9) # of times try_to_wake_up() was called to wake up the local cpu
Next three are statistics describing scheduling latency:
- 20) sum of all time spent running by tasks on this processor (in ms)
- 21) sum of all time spent waiting to run by tasks on this processor (in ms)
- 22) # of tasks (not necessarily unique) given to the processor
-
-The last six are statistics dealing with pull_task():
- 23) # of times pull_task() moved a task to this cpu when newly idle
- 24) # of times pull_task() stole a task from this cpu when another cpu
- was newly idle
- 25) # of times pull_task() moved a task to this cpu when idle
- 26) # of times pull_task() stole a task from this cpu when another cpu
- was idle
- 27) # of times pull_task() moved a task to this cpu when busy
- 28) # of times pull_task() stole a task from this cpu when another cpu
- was busy
+ 10) sum of all time spent running by tasks on this processor (in jiffies)
+ 11) sum of all time spent waiting to run by tasks on this processor (in
+ jiffies)
+ 12) # of timeslices run on this cpu
Domain statistics
@@ -89,65 +62,95 @@ One of these is produced per domain for each cpu described. (Note that if
CONFIG_SMP is not defined, *no* domains are utilized and these lines
will not appear in the output.)
-domain<N> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
+domain<N> <cpumask> 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36
The first field is a bit mask indicating what cpus this domain operates over.
-The next fifteen are a variety of load_balance() statistics:
-
- 1) # of times in this domain load_balance() was called when the cpu
- was idle
- 2) # of times in this domain load_balance() was called when the cpu
- was busy
- 3) # of times in this domain load_balance() was called when the cpu
- was just becoming idle
- 4) # of times in this domain load_balance() tried to move one or more
- tasks and failed, when the cpu was idle
- 5) # of times in this domain load_balance() tried to move one or more
- tasks and failed, when the cpu was busy
- 6) # of times in this domain load_balance() tried to move one or more
- tasks and failed, when the cpu was just becoming idle
- 7) sum of imbalances discovered (if any) with each call to
- load_balance() in this domain when the cpu was idle
- 8) sum of imbalances discovered (if any) with each call to
- load_balance() in this domain when the cpu was busy
- 9) sum of imbalances discovered (if any) with each call to
- load_balance() in this domain when the cpu was just becoming idle
- 10) # of times in this domain load_balance() was called but did not find
- a busier queue while the cpu was idle
- 11) # of times in this domain load_balance() was called but did not find
- a busier queue while the cpu was busy
- 12) # of times in this domain load_balance() was called but did not find
- a busier queue while the cpu was just becoming idle
- 13) # of times in this domain a busier queue was found while the cpu was
- idle but no busier group was found
- 14) # of times in this domain a busier queue was found while the cpu was
- busy but no busier group was found
- 15) # of times in this domain a busier queue was found while the cpu was
- just becoming idle but no busier group was found
-
-Next two are sched_balance_exec() statistics:
- 17) # of times in this domain sched_balance_exec() successfully pushed
- a task to a new cpu
- 18) # of times in this domain sched_balance_exec() tried but failed to
- push a task to a new cpu
-
-Next two are try_to_wake_up() statistics:
- 19) # of times in this domain try_to_wake_up() tried to move a task based
- on affinity and cache warmth
- 20) # of times in this domain try_to_wake_up() tried to move a task based
- on load balancing
-
+The next 24 are a variety of load_balance() statistics grouped into types
+of idleness (idle, busy, and newly idle):
+
+ 1) # of times in this domain load_balance() was called when the
+ cpu was idle
+ 2) # of times in this domain load_balance() checked but found
+ the load did not require balancing when the cpu was idle
+ 3) # of times in this domain load_balance() tried to move one or
+ more tasks and failed, when the cpu was idle
+ 4) sum of imbalances discovered (if any) with each call to
+ load_balance() in this domain when the cpu was idle
+ 5) # of times in this domain pull_task() was called when the cpu
+ was idle
+ 6) # of times in this domain pull_task() was called even though
+ the target task was cache-hot when idle
+ 7) # of times in this domain load_balance() was called but did
+ not find a busier queue while the cpu was idle
+ 8) # of times in this domain a busier queue was found while the
+ cpu was idle but no busier group was found
+
+ 9) # of times in this domain load_balance() was called when the
+ cpu was busy
+ 10) # of times in this domain load_balance() checked but found the
+ load did not require balancing when busy
+ 11) # of times in this domain load_balance() tried to move one or
+ more tasks and failed, when the cpu was busy
+ 12) sum of imbalances discovered (if any) with each call to
+ load_balance() in this domain when the cpu was busy
+ 13) # of times in this domain pull_task() was called when busy
+ 14) # of times in this domain pull_task() was called even though the
+ target task was cache-hot when busy
+ 15) # of times in this domain load_balance() was called but did not
+ find a busier queue while the cpu was busy
+ 16) # of times in this domain a busier queue was found while the cpu
+ was busy but no busier group was found
+
+ 17) # of times in this domain load_balance() was called when the
+ cpu was just becoming idle
+ 18) # of times in this domain load_balance() checked but found the
+ load did not require balancing when the cpu was just becoming idle
+ 19) # of times in this domain load_balance() tried to move one or more
+ tasks and failed, when the cpu was just becoming idle
+ 20) sum of imbalances discovered (if any) with each call to
+ load_balance() in this domain when the cpu was just becoming idle
+ 21) # of times in this domain pull_task() was called when newly idle
+ 22) # of times in this domain pull_task() was called even though the
+ target task was cache-hot when just becoming idle
+ 23) # of times in this domain load_balance() was called but did not
+ find a busier queue while the cpu was just becoming idle
+ 24) # of times in this domain a busier queue was found while the cpu
+ was just becoming idle but no busier group was found
+
+ Next three are active_load_balance() statistics:
+ 25) # of times active_load_balance() was called
+ 26) # of times active_load_balance() tried to move a task and failed
+ 27) # of times active_load_balance() successfully moved a task
+
+ Next three are sched_balance_exec() statistics:
+ 28) sbe_cnt is not used
+ 29) sbe_balanced is not used
+ 30) sbe_pushed is not used
+
+ Next three are sched_balance_fork() statistics:
+ 31) sbf_cnt is not used
+ 32) sbf_balanced is not used
+ 33) sbf_pushed is not used
+
+ Next three are try_to_wake_up() statistics:
+ 34) # of times in this domain try_to_wake_up() awoke a task that
+ last ran on a different cpu in this domain
+ 35) # of times in this domain try_to_wake_up() moved a task to the
+ waking cpu because it was cache-cold on its own cpu anyway
+ 36) # of times in this domain try_to_wake_up() started passive balancing
/proc/<pid>/schedstat
----------------
schedstats also adds a new /proc/<pid/schedstat file to include some of
the same information on a per-process level. There are three fields in
-this file correlating to fields 20, 21, and 22 in the CPU fields, but
-they only apply for that process.
+this file correlating for that process to:
+ 1) time spent on the cpu
+ 2) time spent waiting on a runqueue
+ 3) # of timeslices run on this cpu
A program could be easily written to make use of these extra fields to
report on how well a particular process or set of processes is faring
under the scheduler's policies. A simple version of such a program is
available at
- http://eaglet.rain.com/rick/linux/schedstat/v10/latency.c
+ http://eaglet.rain.com/rick/linux/schedstat/v12/latency.c
diff --git a/Documentation/spi/spidev_test.c b/Documentation/spi/spidev_test.c
new file mode 100644
index 000000000000..218e86215297
--- /dev/null
+++ b/Documentation/spi/spidev_test.c
@@ -0,0 +1,202 @@
+/*
+ * SPI testing utility (using spidev driver)
+ *
+ * Copyright (c) 2007 MontaVista Software, Inc.
+ * Copyright (c) 2007 Anton Vorontsov <avorontsov@ru.mvista.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License.
+ *
+ * Cross-compile with cross-gcc -I/path/to/cross-kernel/include
+ */
+
+#include <stdint.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <fcntl.h>
+#include <sys/ioctl.h>
+#include <linux/types.h>
+#include <linux/spi/spidev.h>
+
+#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))
+
+/*
+ * Report a fatal error: perror() prints s together with the message for the
+ * current errno, then abort() terminates the program.
+ */
+static void pabort(const char *s)
+{
+ perror(s);
+ abort();
+}
+
+/* Run-time settings; parse_opts() overrides them from the command line. */
+/* spidev node that main() opens (-D) */
+static char *device = "/dev/spidev1.1";
+/* SPI_* mode flags OR-ed together by the flag options */
+static uint8_t mode;
+/* bits per word (-b) */
+static uint8_t bits = 8;
+/* max speed in Hz (-s) */
+static uint32_t speed = 500000;
+/* delay in usec (-d), passed as delay_usecs of the transfer */
+static uint16_t delay;
+
+/*
+ * Send a fixed test pattern through the spidev file descriptor fd with a
+ * single SPI_IOC_MESSAGE(1) ioctl, receiving the same number of bytes, and
+ * hex-dump the receive buffer to stdout, six bytes per line.
+ *
+ * The transfer uses the file-scope delay, speed and bits settings.
+ * Aborts through pabort() when the ioctl does not report a transfer.
+ */
+static void transfer(int fd)
+{
+	int ret;
+	size_t i;
+	uint8_t tx[] = {
+		0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+		0x40, 0x00, 0x00, 0x00, 0x00, 0x95,
+		0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+		0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+		0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
+		0xDE, 0xAD, 0xBE, 0xEF, 0xBA, 0xAD,
+		0xF0, 0x0D,
+	};
+	uint8_t rx[ARRAY_SIZE(tx)] = {0, };
+	struct spi_ioc_transfer tr = {
+		.tx_buf = (unsigned long)tx,
+		.rx_buf = (unsigned long)rx,
+		.len = ARRAY_SIZE(tx),
+		.delay_usecs = delay,
+		.speed_hz = speed,
+		.bits_per_word = bits,
+	};
+
+	/*
+	 * The ioctl returns the number of bytes transferred on success and a
+	 * negative value on failure, so anything below 1 means the message
+	 * was not sent. (Testing for "== 1" would miss every real error.)
+	 */
+	ret = ioctl(fd, SPI_IOC_MESSAGE(1), &tr);
+	if (ret < 1)
+		pabort("can't send spi message");
+
+	/* size_t index: avoids comparing a signed int with ARRAY_SIZE(). */
+	for (i = 0; i < ARRAY_SIZE(tx); i++) {
+		if (!(i % 6))
+			puts("");
+		printf("%.2X ", rx[i]);
+	}
+	puts("");
+}
+
+/*
+ * Write the synopsis line and the option list to stdout, then end the
+ * process with exit status 1.
+ */
+void print_usage(char *prog)
+{
+	static const char option_help[] =
+		" -D --device device to use (default /dev/spidev1.1)\n"
+		" -s --speed max speed (Hz)\n"
+		" -d --delay delay (usec)\n"
+		" -b --bpw bits per word \n"
+		" -l --loop loopback\n"
+		" -H --cpha clock phase\n"
+		" -O --cpol clock polarity\n"
+		" -L --lsb least significant bit first\n"
+		" -C --cs-high chip select active high\n"
+		" -3 --3wire SI/SO signals shared\n";
+
+	printf("Usage: %s [-DsbdlHOLC3]\n", prog);
+	/* puts() appends one more newline after the option list. */
+	puts(option_help);
+	exit(1);
+}
+
+/*
+ * Walk the command line with getopt_long() and record the result in the
+ * file-scope settings: -D stores the device path, -s/-d/-b store the
+ * atoi() value of their argument in speed/delay/bits, and each flag option
+ * ORs the matching SPI_* bit into mode. Any other option is handed to
+ * print_usage().
+ */
+void parse_opts(int argc, char *argv[])
+{
+	static struct option lopts[] = {
+		{ "device", 1, 0, 'D' },
+		{ "speed", 1, 0, 's' },
+		{ "delay", 1, 0, 'd' },
+		{ "bpw", 1, 0, 'b' },
+		{ "loop", 0, 0, 'l' },
+		{ "cpha", 0, 0, 'H' },
+		{ "cpol", 0, 0, 'O' },
+		{ "lsb", 0, 0, 'L' },
+		{ "cs-high", 0, 0, 'C' },
+		{ "3wire", 0, 0, '3' },
+		{ NULL, 0, 0, 0 },
+	};
+	int opt;
+
+	for (;;) {
+		opt = getopt_long(argc, argv, "D:s:d:b:lHOLC3", lopts, NULL);
+
+		/* -1 marks the end of the options. */
+		if (opt == -1)
+			return;
+
+		if (opt == 'D')
+			device = optarg;
+		else if (opt == 's')
+			speed = atoi(optarg);
+		else if (opt == 'd')
+			delay = atoi(optarg);
+		else if (opt == 'b')
+			bits = atoi(optarg);
+		else if (opt == 'l')
+			mode |= SPI_LOOP;
+		else if (opt == 'H')
+			mode |= SPI_CPHA;
+		else if (opt == 'O')
+			mode |= SPI_CPOL;
+		else if (opt == 'L')
+			mode |= SPI_LSB_FIRST;
+		else if (opt == 'C')
+			mode |= SPI_CS_HIGH;
+		else if (opt == '3')
+			mode |= SPI_3WIRE;
+		else
+			print_usage(argv[0]);
+	}
+}
+
+/*
+ * Entry point: parse the options, open the spidev node, write and read back
+ * the mode, bits-per-word and max-speed settings, print them, and run one
+ * test transfer. Every failing step aborts through pabort().
+ *
+ * Returns the result of the last configuration ioctl.
+ */
+int main(int argc, char *argv[])
+{
+	int ret = 0;
+	int fd;
+
+	parse_opts(argc, argv);
+
+	fd = open(device, O_RDWR);
+	if (fd < 0)
+		pabort("can't open device");
+
+	/*
+	 * spi mode
+	 */
+	ret = ioctl(fd, SPI_IOC_WR_MODE, &mode);
+	if (ret == -1)
+		pabort("can't set spi mode");
+
+	ret = ioctl(fd, SPI_IOC_RD_MODE, &mode);
+	if (ret == -1)
+		pabort("can't get spi mode");
+
+	/*
+	 * bits per word
+	 */
+	ret = ioctl(fd, SPI_IOC_WR_BITS_PER_WORD, &bits);
+	if (ret == -1)
+		pabort("can't set bits per word");
+
+	ret = ioctl(fd, SPI_IOC_RD_BITS_PER_WORD, &bits);
+	if (ret == -1)
+		pabort("can't get bits per word");
+
+	/*
+	 * max speed hz
+	 */
+	ret = ioctl(fd, SPI_IOC_WR_MAX_SPEED_HZ, &speed);
+	if (ret == -1)
+		pabort("can't set max speed hz");
+
+	ret = ioctl(fd, SPI_IOC_RD_MAX_SPEED_HZ, &speed);
+	if (ret == -1)
+		pabort("can't get max speed hz");
+
+	/* mode and bits are uint8_t and promote to int, so %d is correct. */
+	printf("spi mode: %d\n", mode);
+	printf("bits per word: %d\n", bits);
+	/* speed is a uint32_t: print it unsigned so large values stay positive. */
+	printf("max speed: %u Hz (%u KHz)\n",
+	       (unsigned int)speed, (unsigned int)(speed / 1000));
+
+	transfer(fd);
+
+	close(fd);
+
+	return ret;
+}
diff --git a/Documentation/stable_api_nonsense.txt b/Documentation/stable_api_nonsense.txt
index a2afca3b2bab..847b342b7b20 100644
--- a/Documentation/stable_api_nonsense.txt
+++ b/Documentation/stable_api_nonsense.txt
@@ -10,7 +10,7 @@ kernel to userspace interfaces. The kernel to userspace interface is
the one that application programs use, the syscall interface. That
interface is _very_ stable over time, and will not break. I have old
programs that were built on a pre 0.9something kernel that still work
-just fine on the latest 2.6 kernel release. This interface is the one
+just fine on the latest 2.6 kernel release. That interface is the one
that users and application programmers can count on being stable.
diff --git a/Documentation/sysfs-rules.txt b/Documentation/sysfs-rules.txt
index 42861bb0bc9b..80ef562160bb 100644
--- a/Documentation/sysfs-rules.txt
+++ b/Documentation/sysfs-rules.txt
@@ -1,19 +1,18 @@
Rules on how to access information in the Linux kernel sysfs
-The kernel exported sysfs exports internal kernel implementation-details
+The kernel-exported sysfs exports internal kernel implementation details
and depends on internal kernel structures and layout. It is agreed upon
by the kernel developers that the Linux kernel does not provide a stable
internal API. As sysfs is a direct export of kernel internal
-structures, the sysfs interface can not provide a stable interface eighter,
+structures, the sysfs interface cannot provide a stable interface either;
it may always change along with internal kernel changes.
To minimize the risk of breaking users of sysfs, which are in most cases
low-level userspace applications, with a new kernel release, the users
-of sysfs must follow some rules to use an as abstract-as-possible way to
+of sysfs must follow some rules to use an as-abstract-as-possible way to
access this filesystem. The current udev and HAL programs already
implement this and users are encouraged to plug, if possible, into the
-abstractions these programs provide instead of accessing sysfs
-directly.
+abstractions these programs provide instead of accessing sysfs directly.
But if you really do want or need to access sysfs directly, please follow
the following rules and then your programs should work with future
@@ -25,22 +24,22 @@ versions of the sysfs interface.
implementation details in its own API. Therefore it is not better than
reading directories and opening the files yourself.
Also, it is not actively maintained, in the sense of reflecting the
- current kernel-development. The goal of providing a stable interface
- to sysfs has failed, it causes more problems, than it solves. It
+ current kernel development. The goal of providing a stable interface
+ to sysfs has failed; it causes more problems than it solves. It
violates many of the rules in this document.
- sysfs is always at /sys
Parsing /proc/mounts is a waste of time. Other mount points are a
system configuration bug you should not try to solve. For test cases,
possibly support a SYSFS_PATH environment variable to overwrite the
- applications behavior, but never try to search for sysfs. Never try
+ application's behavior, but never try to search for sysfs. Never try
to mount it, if you are not an early boot script.
- devices are only "devices"
There is no such thing like class-, bus-, physical devices,
interfaces, and such that you can rely on in userspace. Everything is
just simply a "device". Class-, bus-, physical, ... types are just
- kernel implementation details, which should not be expected by
+ kernel implementation details which should not be expected by
applications that look for devices in sysfs.
The properties of a device are:
@@ -48,11 +47,11 @@ versions of the sysfs interface.
- identical to the DEVPATH value in the event sent from the kernel
at device creation and removal
- the unique key to the device at that point in time
- - the kernels path to the device-directory without the leading
+ - the kernel's path to the device directory without the leading
/sys, and always starting with with a slash
- all elements of a devpath must be real directories. Symlinks
pointing to /sys/devices must always be resolved to their real
- target, and the target path must be used to access the device.
+ target and the target path must be used to access the device.
That way the devpath to the device matches the devpath of the
kernel used at event time.
- using or exposing symlink values as elements in a devpath string
@@ -73,17 +72,17 @@ versions of the sysfs interface.
link
- it is retrieved by reading the "driver"-link and using only the
last element of the target path
- - devices which do not have "driver"-link, just do not have a
- driver; copying the driver value in a child device context, is a
+ - devices which do not have "driver"-link just do not have a
+ driver; copying the driver value in a child device context is a
bug in the application
o attributes
- - the files in the device directory or files below a subdirectories
+ - the files in the device directory or files below subdirectories
of the same device directory
- accessing attributes reached by a symlink pointing to another device,
like the "device"-link, is a bug in the application
- Everything else is just a kernel driver-core implementation detail,
+ Everything else is just a kernel driver-core implementation detail
that should not be assumed to be stable across kernel releases.
- Properties of parent devices never belong into a child device.
@@ -91,25 +90,25 @@ versions of the sysfs interface.
context properties. If the device 'eth0' or 'sda' does not have a
"driver"-link, then this device does not have a driver. Its value is empty.
Never copy any property of the parent-device into a child-device. Parent
- device-properties may change dynamically without any notice to the
+ device properties may change dynamically without any notice to the
child device.
-- Hierarchy in a single device-tree
+- Hierarchy in a single device tree
There is only one valid place in sysfs where hierarchy can be examined
and this is below: /sys/devices.
- It is planned, that all device directories will end up in the tree
+ It is planned that all device directories will end up in the tree
below this directory.
- Classification by subsystem
There are currently three places for classification of devices:
/sys/block, /sys/class and /sys/bus. It is planned that these will
- not contain any device-directories themselves, but only flat lists of
+ not contain any device directories themselves, but only flat lists of
symlinks pointing to the unified /sys/devices tree.
All three places have completely different rules on how to access
device information. It is planned to merge all three
- classification-directories into one place at /sys/subsystem,
- following the layout of the bus-directories. All buses and
- classes, including the converted block-subsystem, will show up
+ classification directories into one place at /sys/subsystem,
+ following the layout of the bus directories. All buses and
+ classes, including the converted block subsystem, will show up
there.
The devices belonging to a subsystem will create a symlink in the
"devices" directory at /sys/subsystem/<name>/devices.
@@ -121,38 +120,38 @@ versions of the sysfs interface.
subsystem name.
Assuming /sys/class/<subsystem> and /sys/bus/<subsystem>, or
- /sys/block and /sys/class/block are not interchangeable, is a bug in
+ /sys/block and /sys/class/block are not interchangeable is a bug in
the application.
- Block
- The converted block-subsystem at /sys/class/block, or
+ The converted block subsystem at /sys/class/block or
/sys/subsystem/block will contain the links for disks and partitions
- at the same level, never in a hierarchy. Assuming the block-subsytem to
- contain only disks and not partition-devices in the same flat list is
+ at the same level, never in a hierarchy. Assuming the block subsystem to
+ contain only disks and not partition devices in the same flat list is
a bug in the application.
- "device"-link and <subsystem>:<kernel name>-links
Never depend on the "device"-link. The "device"-link is a workaround
- for the old layout, where class-devices are not created in
- /sys/devices/ like the bus-devices. If the link-resolving of a
- device-directory does not end in /sys/devices/, you can use the
+ for the old layout, where class devices are not created in
+ /sys/devices/ like the bus devices. If the link-resolving of a
+ device directory does not end in /sys/devices/, you can use the
"device"-link to find the parent devices in /sys/devices/. That is the
- single valid use of the "device"-link, it must never appear in any
+ single valid use of the "device"-link; it must never appear in any
path as an element. Assuming the existence of the "device"-link for
a device in /sys/devices/ is a bug in the application.
Accessing /sys/class/net/eth0/device is a bug in the application.
Never depend on the class-specific links back to the /sys/class
directory. These links are also a workaround for the design mistake
- that class-devices are not created in /sys/devices. If a device
+ that class devices are not created in /sys/devices. If a device
directory does not contain directories for child devices, these links
may be used to find the child devices in /sys/class. That is the single
- valid use of these links, they must never appear in any path as an
+ valid use of these links; they must never appear in any path as an
element. Assuming the existence of these links for devices which are
- real child device directories in the /sys/devices tree, is a bug in
+ real child device directories in the /sys/devices tree is a bug in
the application.
- It is planned to remove all these links when when all class-device
+ It is planned to remove all these links when all class device
directories live in /sys/devices.
- Position of devices along device chain can change.
@@ -161,6 +160,5 @@ versions of the sysfs interface.
the chain. You must always request the parent device you are looking for
by its subsystem value. You need to walk up the chain until you find
the device that matches the expected subsystem. Depending on a specific
- position of a parent device, or exposing relative paths, using "../" to
- access the chain of parents, is a bug in the application.
-
+ position of a parent device or exposing relative paths using "../" to
+ access the chain of parents is a bug in the application.
diff --git a/Documentation/sysrq.txt b/Documentation/sysrq.txt
index ba328f255417..ef19142896ca 100644
--- a/Documentation/sysrq.txt
+++ b/Documentation/sysrq.txt
@@ -1,6 +1,6 @@
Linux Magic System Request Key Hacks
Documentation for sysrq.c
-Last update: 2007-MAR-14
+Last update: 2007-AUG-04
* What is the magic SysRq key?
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -78,7 +78,7 @@ On all - write a character to /proc/sysrq-trigger. e.g.:
'g' - Used by kgdb on ppc and sh platforms.
'h' - Will display help (actually any other key than those listed
- above will display help. but 'h' is easy to remember :-)
+ here will display help. but 'h' is easy to remember :-)
'i' - Send a SIGKILL to all processes, except for init.
diff --git a/Documentation/thinkpad-acpi.txt b/Documentation/thinkpad-acpi.txt
index 6711fbcf4080..eb2f5986e1eb 100644
--- a/Documentation/thinkpad-acpi.txt
+++ b/Documentation/thinkpad-acpi.txt
@@ -105,10 +105,10 @@ The version of thinkpad-acpi's sysfs interface is exported by the driver
as a driver attribute (see below).
Sysfs driver attributes are on the driver's sysfs attribute space,
-for 2.6.20 this is /sys/bus/platform/drivers/thinkpad-acpi/.
+for 2.6.20 this is /sys/bus/platform/drivers/thinkpad_acpi/.
Sysfs device attributes are on the driver's sysfs attribute space,
-for 2.6.20 this is /sys/devices/platform/thinkpad-acpi/.
+for 2.6.20 this is /sys/devices/platform/thinkpad_acpi/.
Driver version
--------------
diff --git a/Documentation/vm/slabinfo.c b/Documentation/vm/slabinfo.c
index d4f21ffd1404..1af7bd5a2183 100644
--- a/Documentation/vm/slabinfo.c
+++ b/Documentation/vm/slabinfo.c
@@ -396,7 +396,7 @@ void report(struct slabinfo *s)
if (strcmp(s->name, "*") == 0)
return;
- printf("\nSlabcache: %-20s Aliases: %2d Order : %2d Objects: %d\n",
+ printf("\nSlabcache: %-20s Aliases: %2d Order : %2d Objects: %lu\n",
s->name, s->aliases, s->order, s->objects);
if (s->hwcache_align)
printf("** Hardware cacheline aligned\n");

Privacy Policy