commit 751b403e15e6024f60e285a2fe7f988163d8ba15 Author: David Arroyo Date: Sat Jul 13 14:05:31 2013 -0400 Initial commit. Beginnings of an ndb parsing library. Lexer/parser functional. What's left is mapping the input to Go values. diff --git a/README.md b/README.md new file mode 100644 index 0000000..e69de29 diff --git a/decode.go b/decode.go new file mode 100644 index 0000000..faea4ff --- /dev/null +++ b/decode.go @@ -0,0 +1 @@ +package ndb diff --git a/encode.go b/encode.go new file mode 100644 index 0000000..faea4ff --- /dev/null +++ b/encode.go @@ -0,0 +1 @@ +package ndb diff --git a/ndb.go b/ndb.go new file mode 100644 index 0000000..c1a9701 --- /dev/null +++ b/ndb.go @@ -0,0 +1,155 @@ +// Package ndb decodes and encodes simple strings of key=value pairs. +// The accepted format is based on Plan 9's ndb(6) format found at +// http://plan9.bell-labs.com/magic/man2html/6/ndb . Values containing +// white space must be quoted in single quotes. Two single quotes escape +// a literal single quote. Attributes must not contain white space. A +// value may contain any printable unicode character except for a new line. +package ndb + +import ( + "reflect" + "bytes" + "bufio" + "net/textproto" + "fmt" + "io" + "unicode/utf8" +) + +// A SyntaxError contains the data that caused an error and the +// offset of the first byte that caused the syntax error. Data may +// only be valid until the next call to the Decode() method +type SyntaxError struct { + Data []byte + Offset int64 + Message string +} + +// A TypeError occurs when a Go value is incompatible with the ndb +// string it must store or create. +type TypeError struct { + Type reflect.Type +} + +func (e *TypeError) Error() string { + return fmt.Sprintf("Invalid type %s or nil pointer", e.Type.String()) +} + +func min(a,b int64) int64 { + if a < b { + return a + } + return b +} + +func (e *SyntaxError) Error() string { + start := e.Offset + end := min(e.Offset + 10, int64(len(e.Data))) + + // Make sure we're on utf8 boundaries + for !utf8.RuneStart(e.Data[start]) && start > 0 { + start-- + } + for !utf8.Valid(e.Data[start:end]) && end < int64(len(e.Data)) { + end++ + } + + return fmt.Sprintf("%s\n\tat `%s'", e.Message, e.Data[start:end]) +} + +// An Encoder wraps an io.Writer and serializes Go values +// into ndb strings. Successive calls to the Encode() method +// append lines to the io.Writer. +type Encoder struct { + out bufio.Writer +} + +// A decoder wraps an io.Reader and decodes successive ndb strings +// into Go values using the Decode() function. +type Decoder struct { + src *textproto.Reader + pairbuf []pair +} + +// The Parse function reads an entire ndb string and unmarshals it +// into the Go value v. Parse will behave differently depending on +// the concrete type of v. Value v must be a reference type, either a +// pointer, map, or slice. +// +// * If v is a slice, Parse will decode all lines from the ndb +// input into array elements. Otherwise, Parse will decode only +// the first line. +// +// * If v is of the type (map[string] interface{}), Parse will +// populate v with key/value pairs, where value is decoded +// according to the concrete type of the map's value. +// +// * If v is a struct, Parse will populate struct fields whose +// names match the ndb attribute. Struct fields may be annotated +// with a tag of the form `ndb: name`, where name matches the +// attribute string in the ndb input. +// +// Struct fields or map keys that do not match the ndb input are left +// unmodified. Ndb attributes that do not match any struct fields are +// silently dropped. If an ndb string cannot be converted to the +// destination value or a syntax error occurs, an error is returned +// and v is left unmodified. Parse can only store to exported (capitalized) +// fields of a struct. +func Parse(data []byte, v interface{}) error { + d := NewDecoder(bytes.NewReader(data)) + return d.Decode(v) +} + +// NewDecoder returns a Decoder with its input pulled from an io.Reader +func NewDecoder(r io.Reader) *Decoder { + d := new(Decoder) + d.src = textproto.NewReader(bufio.NewReader(r)) + return d +} + +// The Decode method follows the same parsing rules as Parse(), but +// will read at most one ndb string. As such, slices or arrays are +// not valid types for v. +func (d *Decoder) Decode(v interface{}) error { + val := reflect.ValueOf(v) + if val.Kind() != reflect.Ptr || val.IsNil() { + return &TypeError{val.Type()} + } + if p,err := d.getPairs(); err != nil { + return err + } else { + return d.saveData(p, val.Elem()) + } +} + +// Emit encodes a value into an ndb string. Emit will use the String +// method of each struct field or map entry to produce ndb output. +// If v is a slice or array, multiple ndb lines will be output, one +// for each element. For structs, attribute names will be the name of +// the struct field, or the fields ndb annotation if it exists. +// Ndb attributes may not contain white space. Ndb values may contain +// white space but may not contain new lines. If Emit cannot produce +// valid ndb strings, an error is returned. +func Emit(v interface{}) ([]byte, error) { + return nil,nil +} + +// The Encode method will write the ndb encoding of the Go value v +// to its backend io.Writer. Unlike Decode(), slice or array values +// are valid, and will cause multiple ndb lines to be written. +// If the value cannot be fully encoded, an error is returned and +// no data will be written to the io.Writer. +func (e *Encoder) Encode(v interface{}) error { + return nil +} + +// NewEncoder returns an Encoder that writes ndb output to an +// io.Writer +func NewEncoder(w io.Writer) *Encoder { + return nil +} + +// Flush forces all outstanding data in an Encoder to be written to +// its backend io.Writer. +func (e *Encoder) Flush() { +} diff --git a/parse.go b/parse.go new file mode 100644 index 0000000..769ce45 --- /dev/null +++ b/parse.go @@ -0,0 +1,177 @@ +package ndb + +import ( + "reflect" + "net/textproto" + "unicode" + "bytes" + "fmt" +) + +type scanner struct { + src *textproto.Reader +} + +type pair struct { + attr, val []byte +} + +func (p pair) String() string { + return fmt.Sprintf("%s => %s", string(p.attr), string(p.val)) +} + +func errBadAttr(line []byte, offset int64) error { + return &SyntaxError { line, offset, "Invalid attribute name" } +} +func errUnterminated(line []byte, offset int64) error { + return &SyntaxError { line, offset, "Unterminated quoted string" } +} +func errBadUnicode(line []byte, offset int64) error { + return &SyntaxError { line, offset, "Invalid UTF8 input" } +} +func errNewline(line []byte, offset int64) error { + return &SyntaxError { line, offset, "Values may not contain new lines" } +} + +func (d *Decoder) getPairs() ([]pair, error) { + var tuples [][]byte + d.pairbuf = d.pairbuf[0:0] + line, err := d.src.ReadContinuedLineBytes() + if err != nil { + return nil,err + } + tuples,err = lex(line) + if err != nil { + return nil,err + } else { + for _,t := range tuples { + d.pairbuf = append(d.pairbuf, parseTuple(t)) + } + } + return d.pairbuf, nil +} + +func (d *Decoder) saveData(p []pair, val reflect.Value) error { + return nil +} + +func parseTuple(tuple []byte) pair { + var p pair + fmt.Printf("Split %s\n", string(tuple)) + s := bytes.SplitN(tuple, []byte("="), 2) + p.attr = s[0] + if len(s) > 1 { + if len(s[1]) > 1 { + if s[1][0] == '\'' && len(s[1]) > 2 && s[1][len(s[1])-1] == '\'' { + s[1] = s[1][1:len(s[1])-1] + } + } + p.val = bytes.Replace(s[1], []byte("''"), []byte("'"), -1) + } + fmt.Println("Made ", p) + return p +} + +type scanState []int +func (s *scanState) push(n int) { + *s = append(*s, n) +} +func (s scanState) top() int { + if len(s) > 0 { + return s[len(s)-1] + } + return scanNone +} +func (s *scanState) pop() int { + v := s.top() + if len(*s) > 0 { + *s = (*s)[0:len(*s)-1] + } + return v +} + +const ( + scanNone = iota + scanAttr + scanValue + scanValueStart + scanQuoteStart + scanQuoteString +) + +func lex(line []byte) ([][]byte, error) { + var offset int64 + state := make(scanState, 0, 3) + tuples := make([][]byte, 0, 10) + buf := bytes.NewReader(line) + var beg int64 + + for r,sz,err := buf.ReadRune(); err == nil; r,sz,err = buf.ReadRune() { + fmt.Printf("(%d,%c) %s|%s\n", state.top(), r, line[:offset], line[offset:]) + if r == 0xFFFD && sz == 1 { + return nil, errBadUnicode(line, offset) + } + switch state.top() { + case scanNone: + if unicode.IsSpace(r) { + // skip + } else if unicode.IsLetter(r) || unicode.IsNumber(r) { + state.push(scanAttr) + beg = offset + } else { + return nil,errBadAttr(line, offset) + } + case scanAttr: + if unicode.IsSpace(r) { + state.pop() + tuples = append(tuples, line[beg:offset]) + fmt.Println("Save", string(line[beg:offset])) + } else if r == '=' { + state.pop() + state.push(scanValueStart) + } else if !(unicode.IsLetter(r) || unicode.IsNumber(r)) { + return nil,errBadAttr(line, offset) + } + case scanValueStart: + if unicode.IsSpace(r) { + state.pop() + tuples = append(tuples, line[beg:offset]) + fmt.Println("Save", string(line[beg:offset])) + } else if r == '\'' { + state.push(scanQuoteStart) + } else { + state.pop() + state.push(scanValue) + } + case scanValue: + if unicode.IsSpace(r) { + state.pop() + tuples = append(tuples, line[beg:offset]) + fmt.Println("Save", string(line[beg:offset])) + } + case scanQuoteStart: + if r == '\'' { + state.pop() + } else { + state.pop() + state.push(scanQuoteString) + } + case scanQuoteString: + if r == '\'' { + state.pop() + } else if r == '\n' { + return nil,errNewline(line, offset) + } + } + offset += int64(sz) + } + switch state.top() { + case scanQuoteString, scanQuoteStart: + return nil,errUnterminated(line, offset) + case scanNone: + default: + tuples = append(tuples, line[beg:offset]) + fmt.Println("Save", string(line[beg:offset])) + } + return tuples,nil +} diff --git a/parse_test.go b/parse_test.go new file mode 100644 index 0000000..c66e25c --- /dev/null +++ b/parse_test.go @@ -0,0 +1,69 @@ +package ndb + +import ( + "testing" + "bytes" +) + +var parseTests = []struct { + in []byte + out []pair +}{ + { + in: []byte("key1=val1 key2=val2 key3=val3"), + out: []pair { + {[]byte("key1"),[]byte("val1")}, + {[]byte("key2"),[]byte("val2")}, + {[]byte("key3"),[]byte("val3")}}, + }, + { + in: []byte("title='Some value with spaces' width=340 height=200"), + out: []pair { + {[]byte("title"),[]byte("Some value with spaces")}, + {[]byte("width"),[]byte("340")}, + {[]byte("height"),[]byte("200")}}, + }, + { + in: []byte("title='Dave''s pasta' sq=Davis cost=$$"), + out: []pair { + {[]byte("title"),[]byte("Dave's pasta")}, + {[]byte("sq"),[]byte("Davis")}, + {[]byte("cost"),[]byte("$$")}}, + }, + { + in: []byte("action=''bradley key=jay mod=ctrl+alt+shift"), + out: []pair { + {[]byte("action"),[]byte("'bradley")}, + {[]byte("key"),[]byte("jay")}, + {[]byte("mod"),[]byte("ctrl+alt+shift")}}, + }, + { + in: []byte("action=reload key='' mod=ctrl+alt+shift"), + out: []pair { + {[]byte("action"),[]byte("reload")}, + {[]byte("key"),[]byte("'")}, + {[]byte("mod"),[]byte("ctrl+alt+shift")}}, + }, +} + +func Test_parsing(t *testing.T) { + for i,tt := range parseTests { + d := NewDecoder(bytes.NewReader(tt.in)) + p,err := d.getPairs() + if err != nil { + t.Error(err) + t.FailNow() + } else { + for j := range tt.out { + if j > len(p) || !match(p[j],tt.out[j]) { + t.Errorf("%d: getPairs %s => %v, want %v",i, tt.in, p, tt.out) + t.FailNow() + } + } + } + } +} + +func match(p1, p2 pair) bool { + return (bytes.Compare(p1.attr, p2.attr) == 0) && (bytes.Compare(p1.val, p2.val) == 0) +}