From 321f0c7fe2078b244a7c11e1bde79b2d348b120f Mon Sep 17 00:00:00 2001 From: Rob Pike Date: Tue, 19 Oct 2010 20:39:29 -0700 Subject: [PATCH] gob: break documentation into a separate doc.go file R=adg, r2 CC=golang-dev https://golang.org/cl/2596041 --- src/pkg/gob/Makefile | 1 + src/pkg/gob/doc.go | 267 ++++++++++++++++++++++++++++++++++++++++++ src/pkg/gob/encode.go | 262 ----------------------------------------- 3 files changed, 268 insertions(+), 262 deletions(-) create mode 100644 src/pkg/gob/doc.go diff --git a/src/pkg/gob/Makefile b/src/pkg/gob/Makefile index ddbea3e686..77ec9d98ce 100644 --- a/src/pkg/gob/Makefile +++ b/src/pkg/gob/Makefile @@ -8,6 +8,7 @@ TARG=gob GOFILES=\ decode.go\ decoder.go\ + doc.go\ encode.go\ encoder.go\ type.go\ diff --git a/src/pkg/gob/doc.go b/src/pkg/gob/doc.go new file mode 100644 index 0000000000..b9f8d492b2 --- /dev/null +++ b/src/pkg/gob/doc.go @@ -0,0 +1,267 @@ +// Copyright 2009 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +/* +The gob package manages streams of gobs - binary values exchanged between an +Encoder (transmitter) and a Decoder (receiver). A typical use is transporting +arguments and results of remote procedure calls (RPCs) such as those provided by +package "rpc". + +A stream of gobs is self-describing. Each data item in the stream is preceded by +a specification of its type, expressed in terms of a small set of predefined +types. Pointers are not transmitted, but the things they point to are +transmitted; that is, the values are flattened. Recursive types work fine, but +recursive values (data with cycles) are problematic. This may change. + +To use gobs, create an Encoder and present it with a series of data items as +values or addresses that can be dereferenced to values. The Encoder makes sure +all type information is sent before it is needed. At the receive side, a +Decoder retrieves values from the encoded stream and unpacks them into local +variables. + +The source and destination values/types need not correspond exactly. For structs, +fields (identified by name) that are in the source but absent from the receiving +variable will be ignored. Fields that are in the receiving variable but missing +from the transmitted type or value will be ignored in the destination. If a field +with the same name is present in both, their types must be compatible. Both the +receiver and transmitter will do all necessary indirection and dereferencing to +convert between gobs and actual Go values. For instance, a gob type that is +schematically, + + struct { a, b int } + +can be sent from or received into any of these Go types: + + struct { a, b int } // the same + *struct { a, b int } // extra indirection of the struct + struct { *a, **b int } // extra indirection of the fields + struct { a, b int64 } // different concrete value type; see below + +It may also be received into any of these: + + struct { a, b int } // the same + struct { b, a int } // ordering doesn't matter; matching is by name + struct { a, b, c int } // extra field (c) ignored + struct { b int } // missing field (a) ignored; data will be dropped + struct { b, c int } // missing field (a) ignored; extra field (c) ignored. + +Attempting to receive into these types will draw a decode error: + + struct { a int; b uint } // change of signedness for b + struct { a int; b float } // change of type for b + struct { } // no field names in common + struct { c, d int } // no field names in common + +Integers are transmitted two ways: arbitrary precision signed integers or +arbitrary precision unsigned integers. There is no int8, int16 etc. +discrimination in the gob format; there are only signed and unsigned integers. As +described below, the transmitter sends the value in a variable-length encoding; +the receiver accepts the value and stores it in the destination variable. +Floating-point numbers are always sent using IEEE-754 64-bit precision (see +below). + +Signed integers may be received into any signed integer variable: int, int16, etc.; +unsigned integers may be received into any unsigned integer variable; and floating +point values may be received into any floating point variable. However, +the destination variable must be able to represent the value or the decode +operation will fail. + +Structs, arrays and slices are also supported. Strings and arrays of bytes are +supported with a special, efficient representation (see below). + +Interfaces, functions, and channels cannot be sent in a gob. Attempting +to encode a value that contains one will fail. + +The rest of this comment documents the encoding, details that are not important +for most users. Details are presented bottom-up. + +An unsigned integer is sent one of two ways. If it is less than 128, it is sent +as a byte with that value. Otherwise it is sent as a minimal-length big-endian +(high byte first) byte stream holding the value, preceded by one byte holding the +byte count, negated. Thus 0 is transmitted as (00), 7 is transmitted as (07) and +256 is transmitted as (FE 01 00). + +A boolean is encoded within an unsigned integer: 0 for false, 1 for true. + +A signed integer, i, is encoded within an unsigned integer, u. Within u, bits 1 +upward contain the value; bit 0 says whether they should be complemented upon +receipt. The encode algorithm looks like this: + + uint u; + if i < 0 { + u = (^i << 1) | 1 // complement i, bit 0 is 1 + } else { + u = (i << 1) // do not complement i, bit 0 is 0 + } + encodeUnsigned(u) + +The low bit is therefore analogous to a sign bit, but making it the complement bit +instead guarantees that the largest negative integer is not a special case. For +example, -129=^128=(^256>>1) encodes as (FE 01 01). + +Floating-point numbers are always sent as a representation of a float64 value. +That value is converted to a uint64 using math.Float64bits. The uint64 is then +byte-reversed and sent as a regular unsigned integer. The byte-reversal means the +exponent and high-precision part of the mantissa go first. Since the low bits are +often zero, this can save encoding bytes. For instance, 17.0 is encoded in only +three bytes (FE 31 40). + +Strings and slices of bytes are sent as an unsigned count followed by that many +uninterpreted bytes of the value. + +All other slices and arrays are sent as an unsigned count followed by that many +elements using the standard gob encoding for their type, recursively. + +Structs are sent as a sequence of (field number, field value) pairs. The field +value is sent using the standard gob encoding for its type, recursively. If a +field has the zero value for its type, it is omitted from the transmission. The +field number is defined by the type of the encoded struct: the first field of the +encoded type is field 0, the second is field 1, etc. When encoding a value, the +field numbers are delta encoded for efficiency and the fields are always sent in +order of increasing field number; the deltas are therefore unsigned. The +initialization for the delta encoding sets the field number to -1, so an unsigned +integer field 0 with value 7 is transmitted as unsigned delta = 1, unsigned value += 7 or (01 0E). Finally, after all the fields have been sent a terminating mark +denotes the end of the struct. That mark is a delta=0 value, which has +representation (00). + +The representation of types is described below. When a type is defined on a given +connection between an Encoder and Decoder, it is assigned a signed integer type +id. When Encoder.Encode(v) is called, it makes sure there is an id assigned for +the type of v and all its elements and then it sends the pair (typeid, encoded-v) +where typeid is the type id of the encoded type of v and encoded-v is the gob +encoding of the value v. + +To define a type, the encoder chooses an unused, positive type id and sends the +pair (-type id, encoded-type) where encoded-type is the gob encoding of a wireType +description, constructed from these types: + + type wireType struct { + s structType; + } + type fieldType struct { + name string; // the name of the field. + id int; // the type id of the field, which must be already defined + } + type commonType { + name string; // the name of the struct type + id int; // the id of the type, repeated for so it's inside the type + } + type structType struct { + commonType; + field []fieldType; // the fields of the struct. + } + +If there are nested type ids, the types for all inner type ids must be defined +before the top-level type id is used to describe an encoded-v. + +For simplicity in setup, the connection is defined to understand these types a +priori, as well as the basic gob types int, uint, etc. Their ids are: + + bool 1 + int 2 + uint 3 + float 4 + []byte 5 + string 6 + wireType 7 + structType 8 + commonType 9 + fieldType 10 + +In summary, a gob stream looks like + + ((-type id, encoding of a wireType)* (type id, encoding of a value))* + +where * signifies zero or more repetitions and the type id of a value must +be predefined or be defined before the value in the stream. +*/ +package gob + +/* +For implementers and the curious, here is an encoded example. Given + type Point {x, y int} +and the value + p := Point{22, 33} +the bytes transmitted that encode p will be: + 1f ff 81 03 01 01 05 50 6f 69 6e 74 01 ff 82 00 01 02 01 01 78 + 01 04 00 01 01 79 01 04 00 00 00 07 ff 82 01 2c 01 42 00 07 ff + 82 01 2c 01 42 00 +They are determined as follows. + +Since this is the first transmission of type Point, the type descriptor +for Point itself must be sent before the value. This is the first type +we've sent on this Encoder, so it has type id 65 (0 through 64 are +reserved). + + 1f // This item (a type descriptor) is 31 bytes long. + ff 81 // The negative of the id for the type we're defining, -65. + // This is one byte (indicated by FF = -1) followed by + // ^-65<<1 | 1. The low 1 bit signals to complement the + // rest upon receipt. + + // Now we send a type descriptor, which is itself a struct (wireType). + // The type of wireType itself is known (it's built in, as is the type of + // all its components), so we just need to send a *value* of type wireType + // that represents type "Point". + // Here starts the encoding of that value. + // Set the field number implicitly to zero; this is done at the beginning + // of every struct, including nested structs. + 03 // Add 3 to field number; now 3 (wireType.structType; this is a struct). + // structType starts with an embedded commonType, which appears + // as a regular structure here too. + 01 // add 1 to field number (now 1); start of embedded commonType. + 01 // add one to field number (now 1, the name of the type) + 05 // string is (unsigned) 5 bytes long + 50 6f 69 6e 74 // wireType.structType.commonType.name = "Point" + 01 // add one to field number (now 2, the id of the type) + ff 82 // wireType.structType.commonType._id = 65 + 00 // end of embedded wiretype.structType.commonType struct + 01 // add one to field number (now 2, the Field array in wireType.structType) + 02 // There are two fields in the type (len(structType.field)) + 01 // Start of first field structure; add 1 to get field number 1: field[0].name + 01 // 1 byte + 78 // structType.field[0].name = "x" + 01 // Add 1 to get field number 2: field[0].id + 04 // structType.field[0].typeId is 2 (signed int). + 00 // End of structType.field[0]; start structType.field[1]; set field number to 0. + 01 // Add 1 to get field number 1: field[1].name + 01 // 1 byte + 79 // structType.field[1].name = "y" + 01 // Add 1 to get field number 2: field[0].id + 04 // struct.Type.field[1].typeId is 2 (signed int). + 00 // End of structType.field[1]; end of structType.field. + 00 // end of wireType.structType structure + 00 // end of wireType structure + +Now we can send the Point value. Again the field number resets to zero: + + 07 // this value is 7 bytes long + ff 82 // the type number, 65 (1 byte (-FF) followed by 65<<1) + 01 // add one to field number, yielding field 1 + 2c // encoding of signed "22" (0x22 = 44 = 22<<1); Point.x = 22 + 01 // add one to field number, yielding field 2 + 42 // encoding of signed "33" (0x42 = 66 = 33<<1); Point.y = 33 + 00 // end of structure + +The type encoding is long and fairly intricate but we send it only once. +If p is transmitted a second time, the type is already known so the +output will be just: + + 07 ff 82 01 2c 01 42 00 + +A single non-struct value at top level is transmitted like a field with +delta tag 0. For instance, a signed integer with value 3 presented as +the argument to Encode will emit: + + 03 04 00 06 + +Which represents: + + 03 // this value is 3 bytes long + 04 // the type number, 2, represents an integer + 00 // tag delta 0 + 06 // value 3 + +*/ diff --git a/src/pkg/gob/encode.go b/src/pkg/gob/encode.go index 55abeaf657..f664214099 100644 --- a/src/pkg/gob/encode.go +++ b/src/pkg/gob/encode.go @@ -2,270 +2,8 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -/* - The gob package manages streams of gobs - binary values exchanged between an - Encoder (transmitter) and a Decoder (receiver). A typical use is transporting - arguments and results of remote procedure calls (RPCs) such as those provided by - package "rpc". - - A stream of gobs is self-describing. Each data item in the stream is preceded by - a specification of its type, expressed in terms of a small set of predefined - types. Pointers are not transmitted, but the things they point to are - transmitted; that is, the values are flattened. Recursive types work fine, but - recursive values (data with cycles) are problematic. This may change. - - To use gobs, create an Encoder and present it with a series of data items as - values or addresses that can be dereferenced to values. The Encoder makes sure - all type information is sent before it is needed. At the receive side, a - Decoder retrieves values from the encoded stream and unpacks them into local - variables. - - The source and destination values/types need not correspond exactly. For structs, - fields (identified by name) that are in the source but absent from the receiving - variable will be ignored. Fields that are in the receiving variable but missing - from the transmitted type or value will be ignored in the destination. If a field - with the same name is present in both, their types must be compatible. Both the - receiver and transmitter will do all necessary indirection and dereferencing to - convert between gobs and actual Go values. For instance, a gob type that is - schematically, - - struct { a, b int } - - can be sent from or received into any of these Go types: - - struct { a, b int } // the same - *struct { a, b int } // extra indirection of the struct - struct { *a, **b int } // extra indirection of the fields - struct { a, b int64 } // different concrete value type; see below - - It may also be received into any of these: - - struct { a, b int } // the same - struct { b, a int } // ordering doesn't matter; matching is by name - struct { a, b, c int } // extra field (c) ignored - struct { b int } // missing field (a) ignored; data will be dropped - struct { b, c int } // missing field (a) ignored; extra field (c) ignored. - - Attempting to receive into these types will draw a decode error: - - struct { a int; b uint } // change of signedness for b - struct { a int; b float } // change of type for b - struct { } // no field names in common - struct { c, d int } // no field names in common - - Integers are transmitted two ways: arbitrary precision signed integers or - arbitrary precision unsigned integers. There is no int8, int16 etc. - discrimination in the gob format; there are only signed and unsigned integers. As - described below, the transmitter sends the value in a variable-length encoding; - the receiver accepts the value and stores it in the destination variable. - Floating-point numbers are always sent using IEEE-754 64-bit precision (see - below). - - Signed integers may be received into any signed integer variable: int, int16, etc.; - unsigned integers may be received into any unsigned integer variable; and floating - point values may be received into any floating point variable. However, - the destination variable must be able to represent the value or the decode - operation will fail. - - Structs, arrays and slices are also supported. Strings and arrays of bytes are - supported with a special, efficient representation (see below). - - Interfaces, functions, and channels cannot be sent in a gob. Attempting - to encode a value that contains one will fail. - - The rest of this comment documents the encoding, details that are not important - for most users. Details are presented bottom-up. - - An unsigned integer is sent one of two ways. If it is less than 128, it is sent - as a byte with that value. Otherwise it is sent as a minimal-length big-endian - (high byte first) byte stream holding the value, preceded by one byte holding the - byte count, negated. Thus 0 is transmitted as (00), 7 is transmitted as (07) and - 256 is transmitted as (FE 01 00). - - A boolean is encoded within an unsigned integer: 0 for false, 1 for true. - - A signed integer, i, is encoded within an unsigned integer, u. Within u, bits 1 - upward contain the value; bit 0 says whether they should be complemented upon - receipt. The encode algorithm looks like this: - - uint u; - if i < 0 { - u = (^i << 1) | 1 // complement i, bit 0 is 1 - } else { - u = (i << 1) // do not complement i, bit 0 is 0 - } - encodeUnsigned(u) - - The low bit is therefore analogous to a sign bit, but making it the complement bit - instead guarantees that the largest negative integer is not a special case. For - example, -129=^128=(^256>>1) encodes as (FE 01 01). - - Floating-point numbers are always sent as a representation of a float64 value. - That value is converted to a uint64 using math.Float64bits. The uint64 is then - byte-reversed and sent as a regular unsigned integer. The byte-reversal means the - exponent and high-precision part of the mantissa go first. Since the low bits are - often zero, this can save encoding bytes. For instance, 17.0 is encoded in only - three bytes (FE 31 40). - - Strings and slices of bytes are sent as an unsigned count followed by that many - uninterpreted bytes of the value. - - All other slices and arrays are sent as an unsigned count followed by that many - elements using the standard gob encoding for their type, recursively. - - Structs are sent as a sequence of (field number, field value) pairs. The field - value is sent using the standard gob encoding for its type, recursively. If a - field has the zero value for its type, it is omitted from the transmission. The - field number is defined by the type of the encoded struct: the first field of the - encoded type is field 0, the second is field 1, etc. When encoding a value, the - field numbers are delta encoded for efficiency and the fields are always sent in - order of increasing field number; the deltas are therefore unsigned. The - initialization for the delta encoding sets the field number to -1, so an unsigned - integer field 0 with value 7 is transmitted as unsigned delta = 1, unsigned value - = 7 or (01 0E). Finally, after all the fields have been sent a terminating mark - denotes the end of the struct. That mark is a delta=0 value, which has - representation (00). - - The representation of types is described below. When a type is defined on a given - connection between an Encoder and Decoder, it is assigned a signed integer type - id. When Encoder.Encode(v) is called, it makes sure there is an id assigned for - the type of v and all its elements and then it sends the pair (typeid, encoded-v) - where typeid is the type id of the encoded type of v and encoded-v is the gob - encoding of the value v. - - To define a type, the encoder chooses an unused, positive type id and sends the - pair (-type id, encoded-type) where encoded-type is the gob encoding of a wireType - description, constructed from these types: - - type wireType struct { - s structType; - } - type fieldType struct { - name string; // the name of the field. - id int; // the type id of the field, which must be already defined - } - type commonType { - name string; // the name of the struct type - id int; // the id of the type, repeated for so it's inside the type - } - type structType struct { - commonType; - field []fieldType; // the fields of the struct. - } - - If there are nested type ids, the types for all inner type ids must be defined - before the top-level type id is used to describe an encoded-v. - - For simplicity in setup, the connection is defined to understand these types a - priori, as well as the basic gob types int, uint, etc. Their ids are: - - bool 1 - int 2 - uint 3 - float 4 - []byte 5 - string 6 - wireType 7 - structType 8 - commonType 9 - fieldType 10 - - In summary, a gob stream looks like - - ((-type id, encoding of a wireType)* (type id, encoding of a value))* - - where * signifies zero or more repetitions and the type id of a value must - be predefined or be defined before the value in the stream. -*/ package gob -/* - For implementers and the curious, here is an encoded example. Given - type Point {x, y int} - and the value - p := Point{22, 33} - the bytes transmitted that encode p will be: - 1f ff 81 03 01 01 05 50 6f 69 6e 74 01 ff 82 00 01 02 01 01 78 - 01 04 00 01 01 79 01 04 00 00 00 07 ff 82 01 2c 01 42 00 07 ff - 82 01 2c 01 42 00 - They are determined as follows. - - Since this is the first transmission of type Point, the type descriptor - for Point itself must be sent before the value. This is the first type - we've sent on this Encoder, so it has type id 65 (0 through 64 are - reserved). - - 1f // This item (a type descriptor) is 31 bytes long. - ff 81 // The negative of the id for the type we're defining, -65. - // This is one byte (indicated by FF = -1) followed by - // ^-65<<1 | 1. The low 1 bit signals to complement the - // rest upon receipt. - - // Now we send a type descriptor, which is itself a struct (wireType). - // The type of wireType itself is known (it's built in, as is the type of - // all its components), so we just need to send a *value* of type wireType - // that represents type "Point". - // Here starts the encoding of that value. - // Set the field number implicitly to zero; this is done at the beginning - // of every struct, including nested structs. - 03 // Add 3 to field number; now 3 (wireType.structType; this is a struct). - // structType starts with an embedded commonType, which appears - // as a regular structure here too. - 01 // add 1 to field number (now 1); start of embedded commonType. - 01 // add one to field number (now 1, the name of the type) - 05 // string is (unsigned) 5 bytes long - 50 6f 69 6e 74 // wireType.structType.commonType.name = "Point" - 01 // add one to field number (now 2, the id of the type) - ff 82 // wireType.structType.commonType._id = 65 - 00 // end of embedded wiretype.structType.commonType struct - 01 // add one to field number (now 2, the Field array in wireType.structType) - 02 // There are two fields in the type (len(structType.field)) - 01 // Start of first field structure; add 1 to get field number 1: field[0].name - 01 // 1 byte - 78 // structType.field[0].name = "x" - 01 // Add 1 to get field number 2: field[0].id - 04 // structType.field[0].typeId is 2 (signed int). - 00 // End of structType.field[0]; start structType.field[1]; set field number to 0. - 01 // Add 1 to get field number 1: field[1].name - 01 // 1 byte - 79 // structType.field[1].name = "y" - 01 // Add 1 to get field number 2: field[0].id - 04 // struct.Type.field[1].typeId is 2 (signed int). - 00 // End of structType.field[1]; end of structType.field. - 00 // end of wireType.structType structure - 00 // end of wireType structure - - Now we can send the Point value. Again the field number resets to zero: - - 07 // this value is 7 bytes long - ff 82 // the type number, 65 (1 byte (-FF) followed by 65<<1) - 01 // add one to field number, yielding field 1 - 2c // encoding of signed "22" (0x22 = 44 = 22<<1); Point.x = 22 - 01 // add one to field number, yielding field 2 - 42 // encoding of signed "33" (0x42 = 66 = 33<<1); Point.y = 33 - 00 // end of structure - - The type encoding is long and fairly intricate but we send it only once. - If p is transmitted a second time, the type is already known so the - output will be just: - - 07 ff 82 01 2c 01 42 00 - - A single non-struct value at top level is transmitted like a field with - delta tag 0. For instance, a signed integer with value 3 presented as - the argument to Encode will emit: - - 03 04 00 06 - - Which represents: - - 03 // this value is 3 bytes long - 04 // the type number, 2, represents an integer - 00 // tag delta 0 - 06 // value 3 - -*/ - import ( "bytes" "io"