commit f36b04125906b8f654a46650d83d7e17caa1e8d5
Author: dsp <dsp@2f30.org>
Date: Tue, 10 Feb 2015 19:15:16 -0700
initial commit
Diffstat:
16 files changed, 9410 insertions(+), 0 deletions(-)
diff --git a/Makefile b/Makefile
@@ -0,0 +1,12 @@
+# None of these targets produces a file named after itself, so declare
+# them phony; otherwise a stray file called e.g. "test" or "clean" would
+# make the target look up to date and silently skip its recipe.
+.PHONY: all test allbin clean
+
+# Default target: build every binary.
+all: allbin
+
+# Run the unit tests of all packages below this directory.
+test:
+	go test ./...
+
+# Build the archive server executable into the current directory.
+allbin: cmd/archive_server.go
+	go build cmd/archive_server.go
+
+# Remove the built binary and the go tool's build leftovers.
+clean:
+	rm -f archive_server
+	go clean
+
diff --git a/README b/README
@@ -0,0 +1,60 @@
+=====================================================================
+ go-bgp by DsP <dsp@2f30.org>
+=====================================================================
+[General]
+go-bgp is a collection of pure golang libraries and tools for:
+reading and writing MRT files
+parsing BGP messages
+exposing archived BGP messages in various formats over RESTful HTTP/2
+
+[Details]
+mrt/
+ To use this module from Go, open an MRT file, wrap it in a
+ bufio.Scanner and set the scanner's split function to the provided
+ SplitMrt; each call to .Scan() then yields the []byte of one MRT message.
+
+
+archive/
+ This module scans a hierarchy of dated directories that contain
+ either XML-encoded files or MRT files. It then exposes an HTTP API
+ that accepts requests of the form
+ http://host:port/archive?start=YYYYMMDDHHMM&end=YYYYMMDDHHMM&type=mrt
+ or
+ http://host:port/archive/conf?range
+ http://host:port/archive/conf?files
+
+monitor/
+ bgp monitor
+
+tests/
+ contains data for unit tests
+
+doc/
+ relevant RFCs
+
+cmd/
+ executable programs
+
+[License]
+Copyright (c) 2015, dsp <dsp@2f30.org>
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without modification,
+are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+this list of conditions and the following disclaimer.
+
+2. Redistributions in binary form must reproduce the above copyright notice,
+this list of conditions and the following disclaimer in the documentation and/or
+other materials provided with the distribution.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
+OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN
+ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
diff --git a/archive/archive.go b/archive/archive.go
@@ -0,0 +1,607 @@
+package archive
+
+import (
+ "errors"
+ "fmt"
+ "log"
+ "net/http"
+ "net/url"
+ //"io/ioutil"
+ //"bytes"
+ "bufio"
+ "compress/bzip2"
+ mrt "go-bgp/mrt"
+ "io"
+ "os"
+ "path/filepath"
+ "sort"
+ "strconv"
+ "strings"
+ "sync"
+ "time"
+ "unicode"
+)
+
+// HTTP method names that requestHandlerFunc switches on when it dispatches
+// a request to the matching Resource method.
+const (
+ GET = "GET"
+ PUT = "PUT"
+ POST = "POST"
+ DELETE = "DELETE"
+)
+
+// Errors that are delivered to clients through the reply channel.
+var (
+ // sent by GetImpl when the "start"/"end" parameters are missing or not paired up
+ errbadreq = errors.New("malformed request")
+ // sent by GetImpl when a date does not parse or end precedes start
+ errbaddate = errors.New("dates should be in a YYYYMMDDHHMM format and start should be earlier than end")
+ // sent when the archive has no entry files
+ errempty = errors.New("archive empty")
+ // sent by Query when the requested interval lies outside the archived dates
+ errdate = errors.New("no such date in archive")
+)
+
+// Resource is something that can be mounted on an API path.
+// Each method receives the parsed form values of the request and returns
+// the HTTP status code to send plus a channel of replies; the HTTP handler
+// writes every reply to the response until the channel is closed. A nil
+// channel means that there is no body to send.
+type Resource interface {
+ Get(url.Values) (int, chan reply)
+ Put(url.Values) (int, chan reply)
+ Post(url.Values) (int, chan reply)
+ Delete(url.Values) (int, chan reply)
+}
+
+// The *NotAllowed types are meant to be embedded in a Resource
+// implementation. Each one supplies the method it is named after and makes
+// that method answer with status 405 and no reply channel.
+type (
+ GetNotAllowed struct{}
+ PutNotAllowed struct{}
+ PostNotAllowed struct{}
+ DeleteNotAllowed struct{}
+)
+
+// Get rejects the request: status 405 and no reply channel.
+func (GetNotAllowed) Get(vals url.Values) (code int, datac chan reply) {
+	code = http.StatusMethodNotAllowed
+	return code, nil
+}
+
+// Put rejects the request: status 405 and no reply channel.
+func (PutNotAllowed) Put(vals url.Values) (code int, datac chan reply) {
+	code = http.StatusMethodNotAllowed
+	return code, nil
+}
+// Post rejects the request: status 405 and no reply channel.
+func (PostNotAllowed) Post(vals url.Values) (code int, datac chan reply) {
+	code = http.StatusMethodNotAllowed
+	return code, nil
+}
+// Delete rejects the request: status 405 and no reply channel.
+func (DeleteNotAllowed) Delete(vals url.Values) (code int, datac chan reply) {
+	code = http.StatusMethodNotAllowed
+	return code, nil
+}
+
+// API registers Resources as HTTP handlers on the default serve mux and
+// starts the HTTP server.
+type API struct{}
+
+// requestHandlerFunc wraps resource in an http.HandlerFunc.
+//
+// The handler parses the request form, calls the Resource method that
+// matches the HTTP method, writes the status code it returned and then
+// streams every reply received on the returned channel to the client until
+// the producer closes the channel. A reply that carries an error is logged
+// and its text is written to the body in place of data.
+//
+// A request whose form cannot be parsed is answered with 400, and a method
+// other than GET, PUT, POST or DELETE with 405.
+func (api *API) requestHandlerFunc(resource Resource) http.HandlerFunc {
+	return func(rw http.ResponseWriter, req *http.Request) {
+		if err := req.ParseForm(); err != nil {
+			log.Printf("could not parse request form:%s\n", err)
+			http.Error(rw, errbadreq.Error(), http.StatusBadRequest)
+			return
+		}
+		var (
+			datac chan reply
+			code  int
+		)
+		switch req.Method {
+		case GET:
+			code, datac = resource.Get(req.Form)
+		case PUT:
+			code, datac = resource.Put(req.Form)
+		case POST:
+			code, datac = resource.Post(req.Form)
+		case DELETE:
+			code, datac = resource.Delete(req.Form)
+		default:
+			// HEAD, PATCH, OPTIONS, ... used to fall through with code 0,
+			// which is not a valid status code to hand to WriteHeader.
+			code = http.StatusMethodNotAllowed
+		}
+		rw.WriteHeader(code)
+		if datac == nil {
+			// nothing to stream; ranging over a nil channel would block forever
+			return
+		}
+		// the loop ends when the producer closes the channel
+		for r := range datac {
+			if r.err != nil {
+				log.Printf("Error in received from data channel:%s\n", r.err)
+				fmt.Fprintf(rw, "%s\n", r.err)
+				continue
+			}
+			rw.Write(r.data)
+		}
+	}
+}
+
+// AddResource mounts resource on path of the default HTTP serve mux.
+func (api *API) AddResource(resource Resource, path string) {
+	handler := api.requestHandlerFunc(resource)
+	http.HandleFunc(path, handler)
+}
+
+// Start serves the registered resources over HTTP on the given TCP port,
+// on all interfaces. It blocks for as long as the server runs; when the
+// server stops (for instance because the port is already taken) the reason
+// is logged before Start returns.
+func (api *API) Start(port int) {
+	portstr := fmt.Sprintf(":%d", port)
+	// ListenAndServe only ever returns with a non-nil error
+	if err := http.ListenAndServe(portstr, nil); err != nil {
+		log.Printf("http server on %s stopped:%s\n", portstr, err)
+	}
+}
+
+// reply is one chunk of a response travelling from a resource to the HTTP
+// handler: either data to write to the body, or an error to report.
+type reply struct {
+ data []byte
+ err error
+}
+
+// archive is implemented by the concrete archive kinds (MRT, XML).
+//
+// To perform a query asynchronously on possibly many files, Query fires a
+// goroutine that writes its results to the chan reply. The WaitGroup lets
+// the caller know when every such goroutine has finished, so that it can
+// close the channel and thereby end the HTTP transaction.
+//
+// visit has the signature of a filepath.WalkFunc and is what scan passes to
+// filepath.Walk.
+type archive interface {
+ Query(time.Time, time.Time, chan reply, *sync.WaitGroup)
+ visit(string, os.FileInfo, error) error
+}
+
+// xmlstring holds one XML message together with a timestamp in textual and
+// parsed form.
+// NOTE(review): nothing in this file constructs an xmlstring; how timestr
+// and time relate to msg is an assumption based on the field names.
+type xmlstring struct {
+ timestr string
+ msg string
+ time time.Time
+}
+
+// String returns the message text.
+func (x *xmlstring) String() string {
+ return x.msg
+}
+
+// archentryfile describes one file of the archive.
+// A slice of them (timeentryslice) implements sort.Interface, ordered by sdate.
+type archentryfile struct {
+ path string // path of the file as reported by filepath.Walk
+ sdate time.Time // date parsed from the YYYYMMDD.HHMM part of the file name
+ sz int64 // file size in bytes
+}
+
+// timeentryslice is a list of archive files that sorts chronologically by
+// the date of each entry (it implements sort.Interface).
+type timeentryslice []archentryfile
+
+// Len reports the number of entries.
+func (p timeentryslice) Len() int { return len(p) }
+
+// Less reports whether entry i is dated strictly earlier than entry j.
+func (p timeentryslice) Less(i, j int) bool {
+	earlier, later := p[i].sdate, p[j].sdate
+	return later.After(earlier)
+}
+
+// Swap exchanges entries i and j.
+func (p timeentryslice) Swap(i, j int) {
+	tmp := p[i]
+	p[i] = p[j]
+	p[j] = tmp
+}
+
+// fsarchive is the state shared by the file system backed archives
+// (mrtarchive and xmlarchive embed it).
+type fsarchive struct {
+ // root directory that scan walks
+ rootpathstr string
+ // the sorted file list that queries read; replaced when a scan finishes
+ entryfiles *timeentryslice
+ // scratch list that visit appends to while a scan is running
+ tempentryfiles timeentryslice
+ // NOTE(review): curyr/curmon/curday are initialised to 0 and not used
+ // anywhere else in this file
+ curyr int
+ curmon int
+ curday int
+ // command channel read by the Serve goroutine ("SCAN", "DUMPENTRIES", "STOP")
+ reqchan chan string
+ // true while a scan is in progress
+ scanning bool
+ Scanwg *sync.WaitGroup // expose it so callers are able to wait for scan to finish
+ // scan signals the Serve goroutine on this channel when it is done
+ scanch chan struct{}
+ // slack applied to the query window in Query; NewFsArchive sets it to 15 minutes
+ timedelta time.Duration
+ // substring a path must contain for visit to consider the file
+ descriminator string
+ // companion resource that reports the archive's date range and file names
+ conf *fsarconf
+ // present the archive as a RESTful resource; these embedded types
+ // answer PUT, POST and DELETE with 405
+ PutNotAllowed
+ PostNotAllowed
+ DeleteNotAllowed
+}
+
+// mrtarchive is an fsarchive whose files hold MRT records; it adds the
+// MRT specific Get, Query and visit methods.
+type mrtarchive struct {
+ *fsarchive
+}
+
+// xmlarchive is an fsarchive whose files hold one XML message per line; it
+// adds the XML specific Get, Query and visit methods.
+type xmlarchive struct {
+ *fsarchive
+}
+
+// fsarconf is a read-only resource describing the contents of an archive.
+// arfiles points at the archive's file list; it stays nil until the first
+// scan has finished. PUT, POST and DELETE are answered with 405.
+type fsarconf struct {
+ arfiles *timeentryslice
+ PutNotAllowed
+ PostNotAllowed
+ DeleteNotAllowed
+}
+
+// Get answers configuration queries about the archive:
+//   ?range  replies with the dates of the first and of the last file
+//   ?files  replies with the base name of every file, one per reply
+// Any other query produces an empty body.
+//
+// The reply channel is created here and has to be handed back to the
+// response writer before anybody can receive from it, so all sends are done
+// from a separate goroutine; sending directly would block forever. The
+// goroutine closes the channel when it is done so that the reader finishes.
+func (fsc *fsarconf) Get(values url.Values) (int, chan reply) {
+	out := make(chan reply)
+	go func() {
+		defer close(out)
+		if fsc.arfiles == nil {
+			log.Printf("nil arfile in fsarconf. ignoring request\n")
+			return
+		}
+		_, wantRange := values["range"]
+		_, wantFiles := values["files"]
+		if wantRange {
+			entries := *fsc.arfiles
+			if len(entries) == 0 {
+				out <- reply{data: nil, err: errempty}
+				return
+			}
+			first, last := entries[0].sdate, entries[len(entries)-1].sdate
+			out <- reply{data: []byte(fmt.Sprintf("%s - %s\n", first, last)), err: nil}
+			return
+		}
+		if wantFiles {
+			for _, entry := range *fsc.arfiles {
+				name := filepath.Base(entry.path)
+				out <- reply{data: []byte(fmt.Sprintf("%s\n", name)), err: nil}
+			}
+		}
+	}()
+	return 200, out
+}
+
+func (fsa *fsarchive) GetImpl(values url.Values, ar archive) (int, chan reply) {
+ var grwg sync.WaitGroup
+ retc := make(chan reply)
+ timeAstrs, ok1 := values["start"]
+ timeBstrs, ok2 := values["end"]
+ if len(timeAstrs) != len(timeBstrs) || !ok1 || !ok2 {
+ retc <- reply{data: nil, err: errbadreq}
+ goto done
+ }
+ for i := 0; i < len(timeAstrs); i++ {
+ log.Printf("timeAstr:%s timeBstr:%s", timeAstrs[i], timeBstrs[i])
+ timeA, errtime := time.Parse("200601021504", timeAstrs[i])
+ timeB, errtime := time.Parse("200601021504", timeBstrs[i])
+ if errtime != nil || timeB.Before(timeA) {
+ retc <- reply{data: nil, err: errbaddate}
+ } else {
+ //buf.WriteString(fmt.Sprintf("quering from t0:%s - t1:%s\n", timeA, timeB))
+ ar.Query(timeA, timeB, retc, &grwg) //this will fire a new goroutine
+ }
+ }
+ // the last goroutine that will wait for all we invoked and close the chan
+ go func(wg *sync.WaitGroup) {
+ wg.Wait() //wait for all the goroutines to finish sending
+ close(retc) //close the chan so that range in responsewriter will finish
+ log.Printf("closing the chan\n")
+ }(&grwg)
+done:
+ return 200, retc
+}
+
+// Get serves a GET request by running the shared GetImpl with the MRT
+// archive as the query backend.
+func (fsa *mrtarchive) Get(values url.Values) (int, chan reply) {
+	code, datac := fsa.fsarchive.GetImpl(values, fsa)
+	return code, datac
+}
+
+// Get serves a GET request by running the shared GetImpl with the XML
+// archive as the query backend.
+func (fsa *xmlarchive) Get(values url.Values) (int, chan reply) {
+	code, datac := fsa.fsarchive.GetImpl(values, fsa)
+	return code, datac
+}
+
+// Query scans, in a new goroutine, the archive files that may hold MRT
+// records dated between ta and tb.
+//
+// Errors (empty archive, interval outside of the archive, unreadable MRT
+// header) are sent on retc. wg is incremented before the goroutine starts
+// and decremented when it ends, so that the caller can wait for it and
+// close retc afterwards.
+//
+// NOTE(review): the records themselves are not forwarded on retc yet; the
+// scan only logs the date of every record it reads.
+func (ma *mrtarchive) Query(ta, tb time.Time, retc chan reply, wg *sync.WaitGroup) {
+	log.Printf("querying mrt from %s to %s\n", ta, tb)
+	// Add must happen before the goroutine is started: done inside of it,
+	// the caller's Wait could return (and retc be closed) before the
+	// goroutine had registered itself.
+	wg.Add(1)
+	go func(rc chan<- reply) {
+		defer wg.Done()
+		ef := *ma.entryfiles
+		if len(ef) == 0 {
+			rc <- reply{nil, errempty}
+			return
+		}
+		if tb.Before(ef[0].sdate) || ta.After(ef[len(ef)-1].sdate.Add(ma.timedelta)) {
+			rc <- reply{nil, errdate}
+			return
+		}
+		// first file dated later than (ta - timedelta - 1s) ...
+		i := sort.Search(len(ef), func(i int) bool {
+			return ef[i].sdate.After(ta.Add(-ma.timedelta - time.Second))
+		})
+		// ... up to, excluding, the first file dated after tb
+		j := sort.Search(len(ef), func(i int) bool {
+			return ef[i].sdate.After(tb)
+		})
+		for k := i; k < j; k++ {
+			ma.scanMrtFile(ef[k], rc)
+		}
+	}(retc)
+}
+
+// scanMrtFile reads ent record by record, decompressing it when its name
+// ends in .bz2, and logs the date found in the header of every record.
+// Header errors are reported on rc. The file is closed on every return path.
+func (ma *mrtarchive) scanMrtFile(ent archentryfile, rc chan<- reply) {
+	file, ferr := os.Open(ent.path)
+	if ferr != nil {
+		log.Println("failed opening file: ", ent.path, " ", ferr)
+		return
+	}
+	defer file.Close()
+	var src io.Reader = file
+	if filepath.Ext(ent.path) == ".bz2" {
+		log.Printf("bunzip2 file. opening decompression stream\n")
+		src = bzip2.NewReader(file)
+	} else {
+		log.Printf("no extension on file: %s. opening normally\n", ent.path)
+	}
+	scanner := bufio.NewScanner(src)
+	scanner.Split(mrt.SplitMrt)
+	startt := time.Now()
+	for scanner.Scan() {
+		data := scanner.Bytes()
+		// guard the slice expression below against a truncated record
+		// (presumably SplitMrt never yields one -- TODO confirm)
+		if len(data) < int(mrt.MrtHdr_size) {
+			log.Printf("short mrt record of %d bytes in file %s. ignoring\n", len(data), ent.path)
+			continue
+		}
+		hdr, errh := mrt.NewMrtHdr(data[:mrt.MrtHdr_size])
+		if errh != nil {
+			log.Printf("error in creating MRT header:%s", errh)
+			rc <- reply{data: nil, err: errh}
+			continue
+		}
+		date := time.Unix(int64(hdr.Mrt_timestamp), 0)
+		log.Printf("scanned mrt with date:%s", date)
+		// TODO: send the records dated between ta and tb on rc
+	}
+	if err := scanner.Err(); err != nil && err != io.EOF {
+		log.Printf("file scanner error:%s\n", err)
+	}
+	log.Printf("finished parsing file %s size %d in %s\n", ent.path, ent.sz, time.Since(startt))
+}
+
+// visit is the filepath.WalkFunc of an MRT archive scan.
+//
+// It records every regular file whose path contains the archive's
+// descriminator and whose name has the form foo.YYYYMMDD.HHMM.bz2, that is:
+// exactly 13 characters between the first digit of the name and the last
+// ".bz2". The date is parsed out of the name and the file is appended to
+// the scratch list of the scan.
+//
+// It always returns nil, so that a file that cannot be examined or does not
+// match never aborts the walk.
+func (fsa *mrtarchive) visit(path string, f os.FileInfo, err error) error {
+	// Walk reports problems through err, and f may then be nil
+	if err != nil || f == nil {
+		log.Printf("visit: cannot examine path:%s error:%v . ignoring\n", path, err)
+		return nil
+	}
+	fname := f.Name()
+	log.Print("examining mrt: ", fname)
+	if !strings.Contains(path, fsa.descriminator) {
+		log.Printf("visit: descriminator:%s not found in path:%s . ignoring\n", fsa.descriminator, path)
+		return nil
+	}
+	if !f.Mode().IsRegular() {
+		return nil
+	}
+	numind := strings.IndexFunc(fname, unicode.IsDigit)
+	extind := strings.LastIndex(fname, ".bz2")
+	// len("YYYYMMDD.HHMM") == 13
+	if numind == -1 || extind == -1 || extind-numind != 13 {
+		log.Printf("file: %s not in foo.YYYYMMDD.HHMM.bz2... format. extind:%d numberind:%d", fname, extind, numind)
+		return nil
+	}
+	datestr := fname[numind:extind]
+	log.Println("datestr in filename is ", datestr)
+	// not named "time", so that the package stays reachable
+	stamp, errtime := time.Parse("20060102.1504", datestr)
+	if errtime != nil {
+		log.Print("time.Parse() failed on file: ", fname, " that should be in fooHHMM format with error: ", errtime)
+		return nil
+	}
+	fsa.tempentryfiles = append(fsa.tempentryfiles, archentryfile{path: path, sdate: stamp, sz: f.Size()})
+	return nil
+}
+
+// Query scans, in a new goroutine, the archive files that may hold XML
+// messages dated between ta and tb and sends every message whose
+// <DATETIME> lies strictly between the two on retc, one reply per message.
+//
+// Errors (empty archive, interval outside of the archive) are sent on retc
+// as well. wg is incremented before the goroutine starts and decremented
+// when it ends, so that the caller can wait for it and close retc
+// afterwards.
+func (fsa *xmlarchive) Query(ta, tb time.Time, retc chan reply, wg *sync.WaitGroup) {
+	log.Printf("querying from %s to %s\n", ta, tb)
+	// Add must happen before the goroutine is started: done inside of it,
+	// the caller's Wait could return (and retc be closed) before the
+	// goroutine had registered itself.
+	wg.Add(1)
+	go func(rc chan<- reply) {
+		defer wg.Done()
+		ef := *fsa.entryfiles
+		if len(ef) == 0 {
+			rc <- reply{nil, errempty}
+			return
+		}
+		if tb.Before(ef[0].sdate) || ta.After(ef[len(ef)-1].sdate.Add(fsa.timedelta)) {
+			rc <- reply{nil, errdate}
+			return
+		}
+		// first file dated later than (ta - timedelta - 1s) ...
+		i := sort.Search(len(ef), func(i int) bool {
+			return ef[i].sdate.After(ta.Add(-fsa.timedelta - time.Second))
+		})
+		// ... up to, excluding, the first file dated after tb
+		j := sort.Search(len(ef), func(i int) bool {
+			return ef[i].sdate.After(tb)
+		})
+		for k := i; k < j; k++ {
+			fsa.scanXmlFile(ef[k], ta, tb, rc)
+		}
+	}(retc)
+}
+
+// scanXmlFile reads ent line by line (through bunzip2 when its name ends in
+// .bz2) and sends the lines dated strictly between ta and tb on rc. It
+// stops at the first line dated after tb. Files with any extension other
+// than none, .xml or .bz2 are skipped. The file is closed on every return
+// path.
+func (fsa *xmlarchive) scanXmlFile(ent archentryfile, ta, tb time.Time, rc chan<- reply) {
+	fext := filepath.Ext(ent.path)
+	if fext != "" && fext != ".xml" && fext != ".bz2" {
+		log.Printf("unhandled file extension: %s\n", ent.path)
+		return
+	}
+	file, ferr := os.Open(ent.path)
+	if ferr != nil {
+		log.Println("failed opening file: ", ent.path, " ", ferr)
+		return
+	}
+	defer file.Close()
+	var src io.Reader = file
+	if fext == ".bz2" {
+		log.Printf("bunzip2 file. opening decompression stream\n")
+		src = bzip2.NewReader(file)
+	} else {
+		log.Printf("no extension on file: %s. opening normally\n", ent.path)
+	}
+	scanner := bufio.NewScanner(src)
+	startt := time.Now()
+	for scanner.Scan() {
+		str := scanner.Text()
+		dateindi := strings.Index(str, "<DATETIME>")
+		if dateindi == -1 {
+			log.Println("could not locate DATETIME string in xml msg: ", str)
+			continue
+		}
+		dateindi += len("<DATETIME>") // go to start of date data
+		dateindj := strings.Index(str[dateindi:], "</DATETIME>")
+		if dateindj == -1 {
+			log.Println("could not locate closing </DATETIME> string in xml msg: ", str)
+			continue
+		}
+		dateindj += dateindi // make it relative to the start of the line again
+		xmldate, derr := time.Parse(time.RFC3339, str[dateindi:dateindj])
+		if derr != nil {
+			log.Printf("could not parse datetime: %s\n", derr)
+			continue
+		}
+		if xmldate.After(tb) { //only later measurements in this file. leaving
+			break
+		}
+		if xmldate.After(ta) && xmldate.Before(tb) {
+			rc <- reply{data: []byte(fmt.Sprintf("%s\n", str)), err: nil}
+		}
+	}
+	if err := scanner.Err(); err != nil && err != io.EOF {
+		log.Printf("file scanner error:%s\n", err)
+	}
+	log.Printf("finished parsing file %s size %d in %s\n", ent.path, ent.sz, time.Since(startt))
+}
+
+// NewMRTArchive creates an MRT archive rooted at path that only considers
+// files whose path contains descr.
+func NewMRTArchive(path, descr string) *mrtarchive {
+	base := NewFsArchive(path, descr)
+	return &mrtarchive{fsarchive: base}
+}
+
+// NewFsArchive creates the state shared by all file system archives: an
+// archive rooted at path that only considers files whose path contains
+// descr. The file lists start out empty, no scan is running and the query
+// slack (timedelta) is 15 minutes.
+func NewFsArchive(path, descr string) *fsarchive {
+	arch := new(fsarchive)
+	arch.rootpathstr = path
+	arch.descriminator = descr
+	arch.entryfiles = &timeentryslice{}
+	arch.tempentryfiles = timeentryslice{}
+	arch.reqchan = make(chan string)
+	arch.scanch = make(chan struct{})
+	arch.Scanwg = &sync.WaitGroup{}
+	arch.timedelta = 15 * time.Minute
+	arch.conf = &fsarconf{}
+	// curyr, curmon, curday and scanning keep their zero values
+	return arch
+}
+
+// NewXmlArchive creates an XML archive rooted at path that only considers
+// files whose path contains descr.
+func NewXmlArchive(path, descr string) *xmlarchive {
+	base := NewFsArchive(path, descr)
+	return &xmlarchive{fsarchive: base}
+}
+
+// isYearMonthDir checks whether a directory name has the form YYYY.MM:
+// four decimal digits, a dot and two decimal digits that make up a month
+// between 01 and 12.
+//
+// It returns true, year, month if the name has that form and
+// false, 0, 0 otherwise.
+func isYearMonthDir(fname string) (res bool, yr int, mon int) {
+	const form = "YYYY.MM"
+	// the first dot has to sit right after the four year characters
+	if len(fname) != len(form) || strings.IndexByte(fname, '.') != 4 {
+		return false, 0, 0
+	}
+	yrstr, monstr := fname[:4], fname[5:]
+	// Atoi on its own would also accept a sign, as in "+123.01"
+	for _, r := range yrstr + monstr {
+		if r < '0' || r > '9' {
+			return false, 0, 0
+		}
+	}
+	y, err := strconv.Atoi(yrstr)
+	if err != nil {
+		return false, 0, 0
+	}
+	m, err := strconv.Atoi(monstr)
+	if err != nil || m < 1 || m > 12 {
+		return false, 0, 0
+	}
+	//the values were found to be valid
+	return true, y, m
+}
+
+// visit is the filepath.WalkFunc of an XML archive scan.
+//
+// It records every regular file whose path contains the archive's
+// descriminator and whose name has the form foo.YYYYMMDD.HHMM.xml, that is:
+// exactly 13 characters between the first digit of the name and the last
+// ".xml". The date is parsed out of the name and the file is appended to
+// the scratch list of the scan.
+//
+// It always returns nil, so that a file that cannot be examined or does not
+// match never aborts the walk.
+func (fsa *xmlarchive) visit(path string, f os.FileInfo, err error) error {
+	// Walk reports problems through err, and f may then be nil
+	if err != nil || f == nil {
+		log.Printf("visit: cannot examine path:%s error:%v . ignoring\n", path, err)
+		return nil
+	}
+	fname := f.Name()
+	log.Print("examining ", fname)
+	if !strings.Contains(path, fsa.descriminator) {
+		log.Printf("visit: descriminator:%s not found in path:%s . ignoring\n", fsa.descriminator, path)
+		return nil
+	}
+	if !f.Mode().IsRegular() {
+		return nil
+	}
+	numind := strings.IndexFunc(fname, unicode.IsDigit)
+	xmlind := strings.LastIndex(fname, ".xml")
+	// len("YYYYMMDD.HHMM") == 13
+	if numind == -1 || xmlind == -1 || xmlind-numind != 13 {
+		log.Print("file: ", fname, " not in foo.YYYYMMDD.HHMM.xml... format")
+		return nil
+	}
+	datestr := fname[numind:xmlind]
+	log.Println("datestr in filename is ", datestr)
+	// not named "time", so that the package stays reachable
+	stamp, errtime := time.Parse("20060102.1504", datestr)
+	if errtime != nil {
+		log.Print("time.Parse() failed on file: ", fname, " that should be in fooHHMM format with error: ", errtime)
+		return nil
+	}
+	fsa.tempentryfiles = append(fsa.tempentryfiles, archentryfile{path: path, sdate: stamp, sz: f.Size()})
+	return nil
+}
+
+// printEntries writes the path and the date of every file of the current
+// file list to standard output, one file per line.
+func (fsa *fsarchive) printEntries() {
+	log.Printf("dumping entries")
+	entries := *fsa.entryfiles
+	for i := 0; i < len(entries); i++ {
+		fmt.Printf("%s %s\n", entries[i].path, entries[i].sdate)
+	}
+}
+
+// scan walks the archive root, lets ar.visit collect the matching files in
+// the scratch list, sorts that list by date and then signals the Serve
+// goroutine on scanch.
+//
+// It is started by the Serve goroutine only, which also does the
+// bookkeeping around it (scanning flag, Scanwg) and publishes the result.
+func (fsa *fsarchive) scan(ar archive) {
+	//clear the temp slice
+	fsa.tempentryfiles = timeentryslice{}
+	if err := filepath.Walk(fsa.rootpathstr, ar.visit); err != nil {
+		log.Printf("fsarchive: walking %s failed:%s\n", fsa.rootpathstr, err)
+	}
+	sort.Sort(fsa.tempentryfiles)
+	//signal the serve goroutine on scandone channel
+	fsa.scanch <- struct{}{}
+}
+
+// Serve starts the goroutine that owns the archive state and returns the
+// channel on which it accepts commands:
+//   "SCAN"         rescan the archive in the background
+//   "DUMPENTRIES"  print the current file list
+//   "STOP"         finish a running scan, then terminate
+// Closing the channel has the same effect as "STOP".
+//
+// wg is incremented here and decremented when the goroutine terminates.
+// Scanwg is held from the moment a scan is started until its result has
+// been published, so a caller returning from Scanwg.Wait() sees the new
+// file list.
+//
+// NOTE(review): Scanwg is only taken once the goroutine has processed
+// "SCAN"; a caller that calls Scanwg.Wait() right after sending "SCAN" can
+// still get there first. Likewise queries read entryfiles without
+// synchronisation. Both need a change of the calling protocol to fix.
+func (fsa *fsarchive) Serve(wg *sync.WaitGroup, ar archive) (reqchan chan<- string) {
+	if fsa.reqchan == nil { // we have closed the channel and now called again
+		fsa.reqchan = make(chan string)
+	}
+	// the goroutine works on its own reference, fsa.reqchan is reset on stop
+	reqc := fsa.reqchan
+	wg.Add(1)
+	go func() {
+		defer wg.Done()
+		// publish makes the result of a finished scan visible
+		publish := func() {
+			// hand out a copy of the slice header: pointing at the
+			// tempentryfiles field itself would let the next scan
+			// overwrite the list while queries are reading it
+			scanned := fsa.tempentryfiles
+			fsa.entryfiles = &scanned
+			fsa.scanning = false
+			//let the config know
+			log.Printf("setting conf arfiles from :%v to a slice of len: %v\n", fsa.conf.arfiles, len(*fsa.entryfiles))
+			fsa.conf.arfiles = fsa.entryfiles
+			log.Print("fsarchive: scan finished")
+			fsa.Scanwg.Done()
+		}
+		// stop lets a running scan finish, so that its goroutine is not
+		// left blocked on scanch forever
+		stop := func() {
+			log.Print("fsar: stopping")
+			if fsa.scanning {
+				<-fsa.scanch
+				publish()
+			}
+			fsa.reqchan = nil //no more stuff from this channel
+		}
+		for {
+			select {
+			case req, ok := <-reqc:
+				if !ok {
+					// closed by the caller; without this check the
+					// receive would yield "" over and over again
+					stop()
+					return
+				}
+				switch req {
+				case "SCAN":
+					if fsa.scanning {
+						log.Print("fsarchive: already scanning. ignoring command")
+					} else { //fire an async goroutine to scan the files and wait for SCANDONE
+						fsa.scanning = true
+						fsa.Scanwg.Add(1)
+						go fsa.scan(ar)
+					}
+				case "DUMPENTRIES":
+					if fsa.scanning {
+						log.Print("fsar: warning. scanning in progress")
+					}
+					fsa.printEntries()
+				case "STOP":
+					stop()
+					return
+				default:
+					log.Print("fsarchive: unknown request: ", req)
+				}
+			case <-fsa.scanch:
+				//update the reference to our file slice
+				publish()
+			}
+		}
+	}()
+	return reqc
+}
diff --git a/bgp.go b/bgp.go
@@ -0,0 +1 @@
+package bgp
diff --git a/cmd/archive_server.go b/cmd/archive_server.go
@@ -0,0 +1,51 @@
+package main
+
+import (
+ ar "go-bgp/archive"
+ "log"
+ "os"
+ "sync"
+)
+
+// main serves the MRT RIB files found below the directory given as the only
+// command line argument over HTTP, on port 3000 under /archive/mrt/ribs.
+func main() {
+	if len(os.Args) != 2 {
+		log.Fatal("usage: ", os.Args[0], " directory ")
+	}
+	basedirstr := os.Args[1]
+	ribmrtar := ar.NewMRTArchive(basedirstr, "RIBS")
+	wg1 := &sync.WaitGroup{}
+	mrtreqc := ribmrtar.Serve(wg1, ribmrtar)
+	// index the archive before the first request can come in
+	mrtreqc <- "SCAN"
+	ribmrtar.Scanwg.Wait()
+	api := new(ar.API)
+	api.AddResource(ribmrtar, "/archive/mrt/ribs")
+	// blocks for as long as the HTTP server is running
+	api.Start(3000)
+	// Ask the serve goroutine to terminate. The channel must not merely be
+	// closed: the goroutine would then keep receiving empty requests and
+	// never finish, and the Wait below would hang.
+	mrtreqc <- "STOP"
+	wg1.Wait()
+	/*
+		updfsar := NewXmlArchive(basedirstr, "UPDATES")
+		//ribfsar := NewFsArchive(basedirstr, "RIBS")
+		wg2 := &sync.WaitGroup{}
+		updreqc := updfsar.serve(wg2, updfsar)
+		//ribreqc := updfsar.serve(wg2)
+		updreqc <- "SCAN"
+		updfsar.scanwg.Wait()
+		//ribfsar.scanwg.Wait()
+		//time.Sleep(time.Second*2)
+		updreqc <- "DUMPENTRIES"
+		api := new(API)
+		api.AddResource(updfsar, "/archive/updates")
+		//api.AddResource(ribfsar, "/archive/ribs")
+		api.AddResource(updfsar.conf, "/archive/updates/conf")
+		//api.AddResource(ribfsar.conf, "/archive/ribs/conf")
+		api.Start(3000)
+		//reqc<-"STOP"
+
+		close(updreqc)
+		//close(ribreqc)
+		//wait for it
+		//wg1.Wait()
+		wg2.Wait()
+	*/
+	log.Print("all fsarchives stopped. exiting")
+}
diff --git a/doc/draft-ietf-grow-mrt-11.txt b/doc/draft-ietf-grow-mrt-11.txt
@@ -0,0 +1,1625 @@
+
+
+
+Network Working Group L. Blunk
+Internet-Draft M. Karir
+Intended status: Standards Track Merit Network
+Expires: September 9, 2010 C. Labovitz
+ Arbor Networks
+ March 8, 2010
+
+
+ MRT routing information export format
+ draft-ietf-grow-mrt-11.txt
+
+Abstract
+
+ This document describes the MRT format for routing information
+ export. This format was developed in concert with the Multi-threaded
+ Routing Toolkit (MRT) from whence the format takes its name. The
+ format can be used to export routing protocol messages, state
+ changes, and routing information base contents.
+
+Status of this Memo
+
+ This Internet-Draft is submitted to IETF in full conformance with the
+ provisions of BCP 78 and BCP 79.
+
+ Internet-Drafts are working documents of the Internet Engineering
+ Task Force (IETF), its areas, and its working groups. Note that
+ other groups may also distribute working documents as Internet-
+ Drafts.
+
+ Internet-Drafts are draft documents valid for a maximum of six months
+ and may be updated, replaced, or obsoleted by other documents at any
+ time. It is inappropriate to use Internet-Drafts as reference
+ material or to cite them other than as "work in progress."
+
+ The list of current Internet-Drafts can be accessed at
+ http://www.ietf.org/ietf/1id-abstracts.txt.
+
+ The list of Internet-Draft Shadow Directories can be accessed at
+ http://www.ietf.org/shadow.html.
+
+ This Internet-Draft will expire on September 9, 2010.
+
+Copyright Notice
+
+ Copyright (c) 2010 IETF Trust and the persons identified as the
+ document authors. All rights reserved.
+
+ This document is subject to BCP 78 and the IETF Trust's Legal
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 1]
+
+Internet-Draft MRT Format March 2010
+
+
+ Provisions Relating to IETF Documents
+ (http://trustee.ietf.org/license-info) in effect on the date of
+ publication of this document. Please review these documents
+ carefully, as they describe your rights and restrictions with respect
+ to this document. Code Components extracted from this document must
+ include Simplified BSD License text as described in Section 4.e of
+ the Trust Legal Provisions and are provided without warranty as
+ described in the BSD License.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 2]
+
+Internet-Draft MRT Format March 2010
+
+
+Table of Contents
+
+ 1. Requirements notation . . . . . . . . . . . . . . . . . . . . 4
+ 2. Introduction . . . . . . . . . . . . . . . . . . . . . . . . . 5
+ 3. Basic MRT Format . . . . . . . . . . . . . . . . . . . . . . . 6
+ 4. MRT Informational Types . . . . . . . . . . . . . . . . . . . 8
+ 4.1. START Type . . . . . . . . . . . . . . . . . . . . . . . . 8
+ 4.2. I_AM_DEAD Type . . . . . . . . . . . . . . . . . . . . . . 8
+ 5. MRT Routing Information Types . . . . . . . . . . . . . . . . 9
+ 5.1. OSPF Type . . . . . . . . . . . . . . . . . . . . . . . . 9
+ 5.2. TABLE_DUMP Type . . . . . . . . . . . . . . . . . . . . . 10
+ 5.3. TABLE_DUMP_V2 Type . . . . . . . . . . . . . . . . . . . . 11
+ 5.4. BGP4MP Type . . . . . . . . . . . . . . . . . . . . . . . 14
+ 5.4.1. BGP4MP_STATE_CHANGE Subtype . . . . . . . . . . . . . 14
+ 5.4.2. BGP4MP_MESSAGE Subtype . . . . . . . . . . . . . . . . 15
+ 5.4.3. BGP4MP_MESSAGE_AS4 Subtype . . . . . . . . . . . . . . 16
+ 5.4.4. BGP4MP_STATE_CHANGE_AS4 Subtype . . . . . . . . . . . 16
+ 5.4.5. BGP4MP_MESSAGE_LOCAL Subtype . . . . . . . . . . . . . 17
+ 5.4.6. BGP4MP_MESSAGE_AS4_LOCAL Subtype . . . . . . . . . . . 17
+ 5.5. BGP4MP_ET Type . . . . . . . . . . . . . . . . . . . . . . 17
+ 5.6. ISIS Type . . . . . . . . . . . . . . . . . . . . . . . . 18
+ 5.7. ISIS_ET Type . . . . . . . . . . . . . . . . . . . . . . . 18
+ 5.8. OSPFv3 Type . . . . . . . . . . . . . . . . . . . . . . . 18
+ 5.9. OSPFv3_ET Type . . . . . . . . . . . . . . . . . . . . . . 19
+ 6. IANA Considerations . . . . . . . . . . . . . . . . . . . . . 20
+ 6.1. Type Codes . . . . . . . . . . . . . . . . . . . . . . . . 20
+ 6.2. Subtype Codes . . . . . . . . . . . . . . . . . . . . . . 20
+ 7. Security Considerations . . . . . . . . . . . . . . . . . . . 21
+ 8. References . . . . . . . . . . . . . . . . . . . . . . . . . . 22
+ 8.1. Normative References . . . . . . . . . . . . . . . . . . . 22
+ 8.2. Informative References . . . . . . . . . . . . . . . . . . 22
+ Appendix A. Deprecated MRT types . . . . . . . . . . . . . . . . 23
+ A.1. Deprecated MRT Informational Types . . . . . . . . . . . . 23
+ A.1.1. NULL Type . . . . . . . . . . . . . . . . . . . . . . 23
+ A.1.2. DIE Type . . . . . . . . . . . . . . . . . . . . . . . 23
+ A.1.3. PEER_DOWN Type . . . . . . . . . . . . . . . . . . . . 23
+ A.2. Deprecated MRT Routing Information Types . . . . . . . . . 23
+ A.2.1. BGP Type . . . . . . . . . . . . . . . . . . . . . . . 23
+ A.2.2. RIP Type . . . . . . . . . . . . . . . . . . . . . . . 26
+ A.2.3. IDRP Type . . . . . . . . . . . . . . . . . . . . . . 26
+ A.2.4. RIPNG Type . . . . . . . . . . . . . . . . . . . . . . 26
+ A.2.5. BGP4PLUS and BGP4PLUS_01 Types . . . . . . . . . . . . 27
+ A.2.6. Deprecated BGP4MP Subtypes . . . . . . . . . . . . . . 27
+ Authors' Addresses . . . . . . . . . . . . . . . . . . . . . . . . 29
+
+
+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 3]
+
+Internet-Draft MRT Format March 2010
+
+
+1. Requirements notation
+
+ The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
+ "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
+ document are to be interpreted as described in [RFC2119].
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 4]
+
+Internet-Draft MRT Format March 2010
+
+
+2. Introduction
+
+ Researchers and engineers often wish to analyze network behavior by
+ studying routing protocol transactions and routing information base
+ snapshots. To this end, the MRT format was developed to encapsulate,
+ export, and archive this information in a standardized data
+ representation. The BGP routing protocol, in particular, has been
+ the subject of extensive study and analysis which has been
+ significantly aided by the availability of the MRT format. The MRT
+ format was initially defined in the MRT Programmer's Guide [MRT PROG
+ GUIDE].
+
+ This memo serves to document the MRT format as currently implemented
+ in publicly available software. The format has been extended since
+ its original introduction in the MRT toolset and these extensions
+ are also included in this memo. Further extensions may be introduced
+ at a later date through additional definitions of the MRT Type field
+ and Subtype fields.
+
+ A number of MRT message types have been documented in some references
+ but are not known to have been implemented. Further, several types
+ were employed in early MRT implementations, but are no longer
+ actively being used. These types are considered to be deprecated and
+ are documented in a separate appendix at the end of this document.
+ Some of the deprecated types may be of interest to researchers examining
+ historical MRT archives.
+
+ Fields which contain multi-octet numeric values are encoded in
+ network octet order from most significant octet to least significant
+ octet. Fields which contain routing message fields are encoded in
+ the same order as they appear in the packet contents.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 5]
+
+Internet-Draft MRT Format March 2010
+
+
+3. Basic MRT Format
+
+ All MRT format messages have a common header which includes a
+ timestamp, Type, Subtype, and length field. The header is followed
+ by a message field. The MRT common header is illustrated below.
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Timestamp |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Type | Subtype |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Length |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Message... (variable)
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ Header Field Descriptions:
+
+
+ Timestamp:
+
+ Time in seconds since 1 January 1970 00:00:00 UTC
+
+
+ Type:
+
+ A 2-octet field that indicates the Type of information
+ contained in the message field. Types 0 through 4 are
+ informational messages pertaining to the state of an MRT
+ collector, while Types 5 and higher are used to convey routing
+ information.
+
+
+ Subtype:
+
+ A 2-octet field that is used to further distinguish message
+ information within a particular message Type.
+
+
+ Length:
+
+ A 4-octet message length field. The length field contains the
+ number of octets within the message. The length field does not
+ include the length of the MRT common header.
+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 6]
+
+Internet-Draft MRT Format March 2010
+
+
+
+ Message:
+
+ A variable length message. The contents of this field are
+ context dependent upon the Type and Subtype fields.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 7]
+
+Internet-Draft MRT Format March 2010
+
+
+4. MRT Informational Types
+
+ The MRT format defines five Informational Type messages. These
+ messages are intended to signal the state of an MRT data collector
+ and do not contain routing information. These messages are OPTIONAL
+ and were largely intended for use when MRT messages are sent over a
+ network to a remote repository store. However, MRT message
+ repository stores have traditionally resided on the same device as
+ the collector and these Informational Types have seen limited
+ implementation. Further, transport mechanisms for MRT messages are
+ considered to be outside the scope of this document.
+
+ The START and I_AM_DEAD messages MAY be used to provide a time
+ reference when a data collector begins and ends the collection
+ process. The time reference is obtained from the Timestamp field in
+ the MRT message header.
+
+ The message field MAY contain an OPTIONAL message string for
+ diagnostic purposes. The message string encoding MUST follow the
+ UTF-8 transformation format. The Subtype field is unused for these
+ Types and SHOULD be set to 0.
+
+ The MRT Informational Types are defined below:
+
+ 1 START
+ 3 I_AM_DEAD
+
+4.1. START Type
+
+ The START Type indicates a collector is about to begin generating MRT
+ messages.
+
+4.2. I_AM_DEAD Type
+
+ An I_AM_DEAD MRT message indicates that a collector has shut down and
+ has stopped generating MRT messages.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 8]
+
+Internet-Draft MRT Format March 2010
+
+
+5. MRT Routing Information Types
+
+ The following Types are currently defined for the MRT format. Types
+ 11 and 12 were defined in the MRT Toolkit package. The BGP4MP Type,
+ number 16, was initially defined in the Zebra routing software
+ package. The BGP4MP_ET, ISIS, and ISIS_ET Types were initially
+ defined in the Sprint Labs Python Routing Toolkit (PyRT). The OSPFv3
+ and OSPFv3_ET Types are newly defined types created for the OSPFv3
+ routing protocol.
+
+ 11 OSPF
+ 12 TABLE_DUMP
+ 13 TABLE_DUMP_V2
+ 16 BGP4MP
+ 17 BGP4MP_ET
+ 32 ISIS
+ 33 ISIS_ET
+ 48 OSPFv3
+ 49 OSPFv3_ET
+
+5.1. OSPF Type
+
+ This Type supports the OSPF Protocol as defined in RFC 2328
+ [RFC2328]. The Subtype field may contain two possible values:
+
+ 0 OSPF_STATE_CHANGE
+ 1 OSPF_LSA_UPDATE
+
+ The format of the MRT Message field for the OSPF Type is as follows:
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Remote IP address |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Local IP address |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | OSPF Message Contents (variable)
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+
+
+
+
+
+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 9]
+
+Internet-Draft MRT Format March 2010
+
+
+5.2. TABLE_DUMP Type
+
+ The TABLE_DUMP Type is used to encode the contents of a BGP Routing
+ Information Base (RIB). Each RIB entry is encoded in a distinct
+ sequential MRT record. The Subtype field is used to encode whether
+ the RIB entry contains IPv4 or IPv6 addresses. There are two
+ possible values for the Subtype as shown below.
+
+ 1 AFI_IPv4
+ 2 AFI_IPv6
+
+ The format of the TABLE_DUMP Type is illustrated below.
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | View # | Sequence number |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Prefix (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Prefix Length | Status |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Originated Time |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer IP address (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer AS | Attribute Length |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | BGP Attribute... (variable)
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ The View field is normally 0 and is intended for cases where an
+ implementation may have multiple RIB views (such as a route server).
+ In cases where multiple RIB views are present, an implementation may
+ use the view field to distinguish entries from each view. The
+ Sequence field is a simple incremental counter for each RIB entry. A
+ typical RIB dump will exceed the 16-bit bounds of this counter and
+ implementations should simply wrap back to zero and continue
+ incrementing the counter in such cases.
+
+ The Prefix field contains the IP address of a particular RIB entry.
+ The size of this field is dependent on the value of the Subtype for
+ this message. For AFI_IPv4, this field is 4 octets, for AFI_IPv6, it
+ is 16 octets in length. The Prefix Length field indicates the length
+ in bits of the prefix mask for the preceding Prefix field.
+
+ The Status octet is not used in the TABLE_DUMP Type and SHOULD be set
+ to 1.
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 10]
+
+Internet-Draft MRT Format March 2010
+
+
+ The Originated Time contains the 4-octet time at which this prefix
+ was heard. The value represents the time in seconds since 1 January
+ 1970 00:00:00 UTC.
+
+ The Peer IP field is the IP address of the peer which provided the
+ update for this RIB entry. As with the Prefix field, the size of
+ this field is dependent on the Subtype. AFI_IPv4 indicates a 4 octet
+ field and an IPv4 address, while a Subtype of AFI_IPv6 requires a 16
+ octet field and an IPv6 address. The Peer AS field contains the AS
+ number of the peer.
+
+ Attribute Length is the length of the Attribute field and is 2 octets.
+ The Attribute field contains the attribute information for the RIB
+ entry.
+
+5.3. TABLE_DUMP_V2 Type
+
+ The TABLE_DUMP_V2 Type updates the TABLE_DUMP Type to include 4-Byte
+ ASN support and full support for BGP Multiprotocol extensions. It
+ also improves upon the space efficiency of the TABLE_DUMP Type by
+ employing an index table for peers and permitting a single MRT record
+ per NLRI entry. The following subtypes are used with the
+ TABLE_DUMP_V2 Type.
+
+ 1 PEER_INDEX_TABLE
+ 2 RIB_IPV4_UNICAST
+ 3 RIB_IPV4_MULTICAST
+ 4 RIB_IPV6_UNICAST
+ 5 RIB_IPV6_MULTICAST
+ 6 RIB_GENERIC
+
+ An initial PEER_INDEX_TABLE MRT record provides the BGP ID of the
+ collector, an optional view name, and a list of indexed peers.
+ Following the PEER_INDEX_TABLE MRT record, a series of MRT records
+ are used to encode RIB table entries. This series of MRT records use
+ subtypes 2-6 and are separate from the PEER_INDEX_TABLE MRT record
+ itself and include full MRT record headers. The header of the
+ PEER_INDEX_TABLE Subtype is shown below. The View Name is optional
+ and, if not present, the View Name Length MUST be set to 0. The View
+ Name encoding MUST follow the UTF-8 transformation format.
+
+
+
+
+
+
+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 11]
+
+Internet-Draft MRT Format March 2010
+
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Collector BGP ID |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | View Name Length | View Name (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer Count |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ The format of the peer entries is shown below. The PEER_INDEX_TABLE
+ record contains Peer Count peer entries.
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer Type |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer BGP ID |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer IP address (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer AS (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ The Peer Type, Peer BGP ID, Peer IP, and Peer AS fields are repeated
+ as indicated by the Peer Count field. The position of the Peer in
+ the PEER_INDEX_TABLE is used as an index in the subsequent
+ TABLE_DUMP_V2 MRT records. The index number begins with 0.
+
+ The Peer Type field is a bit field which encodes the type of the AS
+ and IP address as follows:
+
+ Bit 0 - unset for IPv4 Peer IP address, set for IPv6
+ Bit 1 - unset when Peer AS is 16 bits, set when it's 32 bits
+
+ The records which follow the PEER_INDEX_TABLE record constitute the
+ RIB entries and include a header which specifies a sequence number,
+ NLRI, and a count of the number of RIB entries which follow.
+
+ The format for the RIB_IPV4_UNICAST, RIB_IPV4_MULTICAST,
+ RIB_IPV6_UNICAST, and RIB_IPV6_MULTICAST headers is shown below.
+ The Prefix Length and Prefix fields are encoded in the same manner as
+ the BGP NLRI encoding for IPV4 and IPV6 prefixes. Namely, the Prefix
+ field contains address prefixes followed by enough trailing bits to
+ make the end of the field fall on an octet boundary. Note that the
+ value of trailing bits is irrelevant.
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 12]
+
+Internet-Draft MRT Format March 2010
+
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Sequence number |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Prefix Length |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Prefix (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Entry Count |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ The RIB_GENERIC header is shown below. It includes Address Family
+ Identifier (AFI), Subsequent AFI and a single NLRI entry. The NLRI
+ information is specific to the AFI and SAFI values. An
+ implementation which does not recognize particular AFI and SAFI
+ values SHOULD discard the remainder of the MRT record.
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Sequence number |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Address Family Identifier |Subsequent AFI |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Network Layer Reachability Information (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Entry Count |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ The RIB entry headers are followed by a series of RIB entries which
+ are repeated Entry Count times. These entries share a common format
+ as shown below. They include a Peer Index from the PEER_INDEX_TABLE
+ MRT record, an originated time for the RIB entry, and the BGP path
+ attribute length and attributes encoded as provided in a BGP Update
+ message.
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer Index |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Originated Time |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Attribute Length |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | BGP Attributes... (variable)
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 13]
+
+Internet-Draft MRT Format March 2010
+
+
+ There is one exception to the encoding of BGP attributes for the BGP
+ MP_REACH_NLRI attribute (BGP Type Code 14) [RFC4760]. Since the
+ AFI, SAFI, and NLRI information is already encoded in the
+ MULTIPROTOCOL header, only the Next Hop Address Length and Next Hop
+ Address fields are included. The Reserved field is omitted. The
+ attribute length is also adjusted to reflect only the length of the
+ Next Hop Address Length and Next Hop Address fields.
+
+5.4. BGP4MP Type
+
+ This Type was initially defined in the Zebra software package for the
+ BGP protocol with multiprotocol extension support as defined by RFC
+ 4760 [RFC4760]. It supersedes the BGP, BGP4PLUS, BGP4PLUS_01 Types.
+ The BGP4MP Type has six Subtypes which are defined as follows:
+
+ 0 BGP4MP_STATE_CHANGE
+ 1 BGP4MP_MESSAGE
+ 4 BGP4MP_MESSAGE_AS4
+ 5 BGP4MP_STATE_CHANGE_AS4
+ 6 BGP4MP_MESSAGE_LOCAL
+ 7 BGP4MP_MESSAGE_AS4_LOCAL
+
+5.4.1. BGP4MP_STATE_CHANGE Subtype
+
+ This record is used to encode state changes in the BGP finite state
+ machine. The BGP FSM states are encoded in the Old State and New
+ State fields to indicate the previous and current state. In some
+ cases, the Peer AS number may be undefined. In such cases, the value
+ of this field may be set to zero. The format is illustrated below:
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer AS number | Local AS number |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Interface Index | Address Family |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer IP address (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Local IP address (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Old State | New State |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+
+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 14]
+
+Internet-Draft MRT Format March 2010
+
+
+ The FSM states are defined in RFC 4271 [RFC4271], Section 8.2.2.
+ Both the old state value and the new state value are encoded as
+ 2-octet numbers. The state values are defined numerically as
+ follows:
+
+ 1 Idle
+ 2 Connect
+ 3 Active
+ 4 OpenSent
+ 5 OpenConfirm
+ 6 Established
+
+ The BGP4MP_STATE_CHANGE message also includes interface index and
+ Address Family fields. The interface index provides the interface
+ number of the peering session. The index value is OPTIONAL and MAY
+ be zero if unknown or unsupported. The Address Family indicates what
+ types of addresses are in the address fields. At present, the
+ following AFI Types are supported:
+
+ 1 AFI_IPv4
+ 2 AFI_IPv6
+
+5.4.2. BGP4MP_MESSAGE Subtype
+
+ This Subtype is used to encode BGP Messages. It can be used to
+ encode any Type of BGP message. The entire BGP message is
+ encapsulated in the BGP Message field, including the 16-octet marker,
+ the 2-octet length, and the 1-octet type fields. Note that the
+ BGP4MP_MESSAGE Subtype does not support 4-Byte AS numbers. Further,
+ the AS_PATH contained in these messages MUST only consist of 2-Byte
+ AS numbers. The BGP4MP_MESSAGE_AS4 Subtype updates the
+ BGP4MP_MESSAGE Subtype in order to support 4-Byte AS numbers. The
+ BGP4MP_MESSAGE fields are shown below:
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer AS number | Local AS number |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Interface Index | Address Family |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer IP address (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Local IP address (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | BGP Message... (variable)
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 15]
+
+Internet-Draft MRT Format March 2010
+
+
+ The interface index provides the interface number of the peering
+ session. The index value is OPTIONAL and MAY be zero if unknown or
+ unsupported. The Address Family indicates what types of addresses
+ are in the subsequent address fields. At present, the following
+ AFI Types are supported:
+
+ 1 AFI_IPv4
+ 2 AFI_IPv6
+
+ Note that the Address Family value only applies to the IP addresses
+ contained in the MRT header. The BGP4MP_MESSAGE Subtype is otherwise
+ transparent to the contents of the actual message which may contain
+ any valid AFI/SAFI values. Only one BGP message may be encoded in
+ the BGP4MP_MESSAGE Subtype.
+
+5.4.3. BGP4MP_MESSAGE_AS4 Subtype
+
+ This Subtype updates the BGP4MP_MESSAGE Subtype to support 4-Byte
+ Autonomous System numbers. The BGP4MP_MESSAGE_AS4 Subtype is
+ otherwise identical to the BGP4MP_MESSAGE Subtype. The AS_PATH in
+ these messages MUST only consist of 4-Byte AS numbers. The
+ BGP4MP_MESSAGE_AS4 fields are shown below:
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer AS number |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Local AS number |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Interface Index | Address Family |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer IP address (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Local IP address (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | BGP Message... (variable)
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+5.4.4. BGP4MP_STATE_CHANGE_AS4 Subtype
+
+ This Subtype updates the BGP4MP_STATE_CHANGE Subtype to support
+ 4-Byte Autonomous System numbers. As with the BGP4MP_STATE_CHANGE
+ Subtype, the BGP FSM states are encoded in the Old State and New
+ State fields to indicate the previous and current state. Aside from
+ the extension of the peer and local AS fields to 4-Bytes, this
+ subtype is otherwise identical to the BGP4MP_STATE_CHANGE Subtype.
+ The BGP4MP_STATE_CHANGE_AS4 fields are shown below:
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 16]
+
+Internet-Draft MRT Format March 2010
+
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer AS number |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Local AS number |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Interface Index | Address Family |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer IP address (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Local IP address (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Old State | New State |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+5.4.5. BGP4MP_MESSAGE_LOCAL Subtype
+
+ Implementations of MRT have largely focused on collecting remotely
+ generated BGP messages in a passive route collector role. However,
+ for active BGP implementations, it can be useful to archive locally
+ generated BGP messages in addition to remote messages. This subtype
+ is added to indicate a locally generated BGP message. The fields
+ remain identical to the BGP4MP_MESSAGE type including the Peer and
+ Local IP and AS fields. The Local fields continue to refer to the
+ local IP and AS number of the collector which generated the message
+ and the Peer IP and AS fields refer to the recipient of the
+ generated BGP messages.
+
+5.4.6. BGP4MP_MESSAGE_AS4_LOCAL Subtype
+
+ As with the BGP4MP_MESSAGE_LOCAL type, this type indicates locally
+ generated messages. The fields are identical to the
+ BGP4MP_MESSAGE_AS4 message type.
+
+5.5. BGP4MP_ET Type
+
+ This Type was initially defined in the Sprint Labs Python Routing
+ Toolkit (PyRT). It extends the MRT common header field to include a
+ 32BIT microsecond timestamp field. The type and subtype field
+ definitions remain as defined for the BGP4MP Type. The 32BIT
+ microsecond timestamp immediately follows the length field in the MRT
+ common header and precedes all other fields in the message. The
+ 32BIT microsecond field is included in the computation of the length
+ field value. The MRT common header modification is illustrated
+ below.
+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 17]
+
+Internet-Draft MRT Format March 2010
+
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Timestamp |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Type | Subtype |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Length |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | microsecond timestamp |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Message... (variable)
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+5.6. ISIS Type
+
+ This Type was initially defined in the Sprint Labs Python Routing
+ Toolkit (PyRT) and supports the IS-IS routing protocol as defined in
+ RFC 1195 [RFC1195]. There is no Type specific header for the ISIS
+ Type. The Subtype code for this Type is undefined. The ISIS PDU
+ directly follows the MRT common header fields.
+
+5.7. ISIS_ET Type
+
+ The ISIS_ET Type extends the ISIS Type to support microsecond
+ timestamps. As with the BGP4MP_ET Type, a 32BIT microsecond
+ timestamp field is appended to the MRT common header after the length
+ field. The ISIS_ET Type is otherwise identical to the ISIS Type.
+
+5.8. OSPFv3 Type
+
+ The OSPFv3 Type extends the original OSPF Type to support IPv6
+ addresses for the OSPFv3 protocol as defined in RFC 5340 [RFC5340].
+ The format of the MRT Message field for the OSPFv3 Type is as
+ follows:
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Address Family |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Remote IP address (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Local IP address (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | OSPF Message Contents (variable)
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 18]
+
+Internet-Draft MRT Format March 2010
+
+
+5.9. OSPFv3_ET Type
+
+ The OSPFv3_ET Type extends the OSPFv3 Type to support microsecond
+ timestamps. As with the BGP4MP_ET Type, a 32BIT microsecond
+ timestamp field is appended to the MRT common header after the length
+ field and its length is included in the calculation of the length
+ field value. The OSPFv3_ET Type is otherwise identical to the OSPFv3
+ Type.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 19]
+
+Internet-Draft MRT Format March 2010
+
+
+6. IANA Considerations
+
+ This section provides guidance to the Internet Assigned Numbers
+ Authority (IANA) regarding registration of values related to the MRT
+ specification, in accordance with BCP 26, RFC 5226 [RFC5226].
+
+ There are two name spaces in MRT that require registration: Type
+ Codes and Subtype Codes.
+
+ MRT is not intended as a general-purpose specification for protocol
+ information export, and allocations should not be made for purposes
+ unrelated to routing protocol information export.
+
+ The following policies are used here with the meanings defined in BCP
+ 26: "Specification Required", "IETF Consensus", "Experimental Use",
+ "First Come First Served".
+
+6.1. Type Codes
+
+ Type Codes have a range from 0 to 65535, of which 1-64 have been
+ allocated. New Type Codes MUST be allocated starting at 65. Type
+ Codes 65 - 511 are to be assigned by IETF Review. Type Codes 512 -
+ 2047 are assigned based on Specification Required. Type Codes 2048 -
+ 64511 are available on a First Come First Served policy. Type Codes
+ 64512 - 65534 are available for Experimental Use. The Type Code
+ Values of 0 and 65535 are reserved.
+
+6.2. Subtype Codes
+
+ Subtype Codes have a range from 0 to 65535. Subtype definitions are
+ specific to a particular Type Code definition. New Subtype Code
+ definition must reference an existing Type Code to which the Subtype
+ belongs. Subtype assignments to Type Codes 0 - 511 are to be
+ assigned by IETF Review. Subtype assignments for the remaining Type
+ Codes follow the assignment rules for the Type Codes to which they
+ belong.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 20]
+
+Internet-Draft MRT Format March 2010
+
+
+7. Security Considerations
+
+ The MRT Format utilizes a structure which can store routing protocol
+ information data. The fields defined in the MRT specification are of
+ a descriptive nature and provide information that is useful to
+ facilitate the analysis of routing data. As such, the fields
+ currently defined in the MRT specification do not in themselves
+ create additional security risks, since the fields are not used to
+ induce any particular behavior by the recipient application.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 21]
+
+Internet-Draft MRT Format March 2010
+
+
+8. References
+
+8.1. Normative References
+
+ [RFC1058] Hedrick, C., "Routing Information Protocol", RFC 1058,
+ June 1988.
+
+ [RFC1195] Callon, R., "Use of OSI IS-IS for routing in TCP/IP and
+ dual environments", RFC 1195, December 1990.
+
+ [RFC2080] Malkin, G. and R. Minnear, "RIPng for IPv6", RFC 2080,
+ January 1997.
+
+ [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate
+ Requirement Levels", BCP 14, RFC 2119, March 1997.
+
+ [RFC2328] Moy, J., "OSPF Version 2", STD 54, RFC 2328, April 1998.
+
+ [RFC4271] Rekhter, Y., Li, T., and S. Hares, "A Border Gateway
+ Protocol 4 (BGP-4)", RFC 4271, January 2006.
+
+ [RFC4760] Bates, T., Chandra, R., Katz, D., and Y. Rekhter,
+ "Multiprotocol Extensions for BGP-4", RFC 4760,
+ January 2007.
+
+ [RFC5226] Narten, T. and H. Alvestrand, "Guidelines for Writing an
+ IANA Considerations Section in RFCs", BCP 26, RFC 5226,
+ May 2008.
+
+ [RFC5340] Coltun, R., Ferguson, D., Moy, J., and A. Lindem, "OSPF
+ for IPv6", RFC 5340, July 2008.
+
+8.2. Informative References
+
+ [MRT PROG GUIDE]
+ Labovitz, C., "MRT Programmer's Guide", November 1999,
+ <http://www.merit.edu/networkresearch/mrtprogrammer.pdf>.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 22]
+
+Internet-Draft MRT Format March 2010
+
+
+Appendix A. Deprecated MRT types
+
+ This Appendix lists deprecated MRT types. These types are documented
+ for informational purposes only. While documented in some
+ references, they are not known to have been generally implemented.
+
+A.1. Deprecated MRT Informational Types
+
+ The deprecated MRT Informational Types are defined below:
+
+ 0 NULL
+ 2 DIE
+ 4 PEER_DOWN
+
+A.1.1. NULL Type
+
+ The NULL Type message causes no operation.
+
+A.1.2. DIE Type
+
+ The DIE Type signals a remote MRT repository it should stop accepting
+ messages.
+
+A.1.3. PEER_DOWN Type
+
+ The PEER_DOWN message was intended to indicate that a collector had
+ lost association with a BGP peer. However, the MRT format provides
+ BGP state change message types which duplicate this functionality.
+
+A.2. Deprecated MRT Routing Information Types
+
+ 5 BGP
+ 6 RIP
+ 7 IDRP
+ 8 RIPNG
+ 9 BGP4PLUS
+ 10 BGP4PLUS_01
+
+A.2.1. BGP Type
+
+ The BGP Type indicates the Message field contains BGP routing
+ information. The BGP routing protocol is defined in RFC 4271
+ [RFC4271]. The information in the message is dependent on the
+ Subtype value. The BGP Type and all associated Subtypes below are
+ considered to be deprecated by the BGP4MP Type.
+
+ The following BGP Subtypes are defined for the MRT BGP Type. As with
+ the BGP Type itself, they are all considered to be deprecated.
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 23]
+
+Internet-Draft MRT Format March 2010
+
+
+ 0 BGP_NULL
+ 1 BGP_UPDATE
+ 2 BGP_PREF_UPDATE
+ 3 BGP_STATE_CHANGE
+ 4 BGP_SYNC
+ 5 BGP_OPEN
+ 6 BGP_NOTIFY
+ 7 BGP_KEEPALIVE
+
+A.2.1.1. BGP_NULL Subtype
+
+ The BGP_NULL Subtype is a reserved Subtype.
+
+A.2.1.2. BGP_UPDATE Subtype
+
+ The BGP_UPDATE Subtype is used to encode BGP UPDATE messages. The
+ format of the MRT Message field for this Subtype is as follows:
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer AS number |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer IP address |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Local AS number |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Local IP address |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | BGP UPDATE Contents (variable)
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ The BGP UPDATE Contents include the entire BGP UPDATE message which
+ follows the BGP Message Header. The BGP Message Header itself is not
+ included. The Peer AS number and IP address fields contain the AS
+ number and IP address of the remote system which is generating the
+ BGP UPDATE messages. The Local AS number and IP address fields
+ contain the AS number and IP address of the local collector system
+ which is archiving the messages.
+
+A.2.1.3. BGP_PREF_UPDATE Subtype
+
+ The BGP_PREF_UPDATE Subtype is not defined.
+
+A.2.1.4. BGP_STATE_CHANGE Subtype
+
+ The BGP_STATE_CHANGE Subtype is used to record changes in the BGP
+ finite state machine. These FSM states are defined in RFC 4271
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 24]
+
+Internet-Draft MRT Format March 2010
+
+
+ [RFC4271], Section 8.2.2. Both the old state value and the new state
+ value are encoded as 2-octet numbers. The state values are defined
+ numerically as follows:
+
+ 1 Idle
+ 2 Connect
+ 3 Active
+ 4 OpenSent
+ 5 OpenConfirm
+ 6 Established
+
+ The format of the MRT Message field is as follows:
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer AS number |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer IP address |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Old State | New State |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+A.2.1.5. BGP_SYNC Subtype
+
+ The BGP_SYNC Subtype was intended to convey a system file name where
+ BGP Table Dump messages should be recorded. The View # was to
+ correspond to the View # provided in the TABLE_DUMP Type messages.
+ There are no known implementations of this subtype and it SHOULD be
+ ignored. The following format applies to this Subtype:
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | View # |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | File Name... (variable)
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ The File Name is terminated with a NULL (0) character.
+
+A.2.1.6. BGP_OPEN Subtype
+
+ The BGP_OPEN Subtype is used to encode BGP OPEN messages. The format
+ of the MRT Message field for this Subtype is the same as the
+ BGP_UPDATE, however, the last field contains the contents of the BGP
+ OPEN message.
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 25]
+
+Internet-Draft MRT Format March 2010
+
+
+A.2.1.7. BGP_NOTIFY Subtype
+
+ The BGP_NOTIFY Subtype is used to encode BGP NOTIFICATION messages.
+ The format of the MRT Message field for this Subtype is the same as
+ the BGP_UPDATE, however, the last field contains the contents of the
+ BGP NOTIFICATION message.
+
+A.2.1.8. BGP_KEEPALIVE Subtype
+
+ The BGP_KEEPALIVE Subtype is used to encode BGP KEEPALIVE messages.
+ The format of the MRT Message field for this Subtype is the same as
+ the BGP_UPDATE, however, the last field contains no information.
+
+A.2.2. RIP Type
+
+ The RIP Type is used to export RIP protocol packets as defined in RFC
+ 1058 [RFC1058]. The Subtype field is currently reserved for this
+ Type and SHOULD be set to 0.
+
+ The format of the MRT Message field for the RIP Type is as follows:
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer IP address |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Local IP address |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | RIP Message Contents (variable)
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+A.2.3. IDRP Type
+
+ The IDRP Type is used to export Inter-Domain-Routing Protocol (IDRP)
+ protocol information as defined in the ISO/IEC 10747 standard. The
+ Subtype field is unused. This Type is deprecated due to lack of
+ deployment of IDRP.
+
+A.2.4. RIPNG Type
+
+ The RIPNG Type is used to export RIPNG protocol packets as defined in
+ RFC 2080 [RFC2080]. The RIPNG protocol updates the RIP protocol to
+ support IPv6. The Subtype field is currently reserved for this Type
+ and SHOULD be set to 0.
+
+ The format of the MRT Message field for the RIPNG Type is as follows:
+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 26]
+
+Internet-Draft MRT Format March 2010
+
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ ~ Peer IPv6 address ~
+ | |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ ~ Local IPv6 address ~
+ | |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | RIPNG Message Contents (variable)
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+A.2.5. BGP4PLUS and BGP4PLUS_01 Types
+
+ The BGP4PLUS and BGP4PLUS_01 Types were defined to support IPv6 BGP
+ routing information. The BGP4PLUS Type was specified based on the
+ initial Internet Draft for Multiprotocol Extensions to BGP-4. The
+ BGP4PLUS_01 Type was specified to correspond to the -01 revision of
+ this Internet Draft. The two Types share the same definitions in
+ terms of their MRT format specifications.
+
+ The Subtype field definitions are shared with the BGP Type, however,
+ the address fields in the BGP_UPDATE, BGP_OPEN, BGP_NOTIFY,
+ BGP_KEEPALIVE, and BGP_STATE_CHANGE Subtype messages are extended to
+ 16 octets for IPv6 addresses. As with the BGP Type, the BGP4PLUS and
+ BGP4PLUS_01 Types are deprecated as they are superseded by the BGP4MP
+ Type.
+
+A.2.6. Deprecated BGP4MP Subtypes
+
+ The following two subtypes of the BGP4MP Type are considered to be
+ deprecated.
+
+ 2 BGP4MP_ENTRY
+ 3 BGP4MP_SNAPSHOT
+
+A.2.6.1. BGP4MP_ENTRY Subtype
+
+ This Subtype is similar to the TABLE_DUMP Type and is used to record
+ RIB table entries. It extends the TABLE_DUMP Type to include true
+ multiprotocol support. However, this Type does not support 4-Byte AS
+ numbers and has not been widely implemented. This Type is deprecated
+ in favor of the TABLE_DUMP_V2 which includes 4-Byte AS number support
+ and a more compact format.
+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 27]
+
+Internet-Draft MRT Format March 2010
+
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer AS number | Local AS number |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Interface Index | Address Family |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Peer IP address (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Local IP address (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | View # | Status |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Time last change |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Address Family | SAFI | Next-Hop-Len |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Next Hop Address (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Prefix Length |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Address Prefix (variable) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Attribute Length |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | BGP Attribute... (variable)
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+A.2.6.2. BGP4MP_SNAPSHOT Subtype
+
+ This Subtype was intended to convey a system file name where
+ BGP4MP_ENTRY messages should be recorded. It is similar to the
+ BGP_SYNC message Subtype and is deprecated.
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | View # |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | File Name... (variable)
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+
+
+
+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 28]
+
+Internet-Draft MRT Format March 2010
+
+
+Authors' Addresses
+
+ Larry Blunk
+ Merit Network
+
+ Email: ljb@merit.edu
+
+
+ Manish Karir
+ Merit Network
+
+ Email: mkarir@merit.edu
+
+
+ Craig Labovitz
+ Arbor Networks
+
+ Email: labovit@arbor.net
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Blunk, et al. Expires September 9, 2010 [Page 29]
+
+
diff --git a/doc/rfc1771.txt b/doc/rfc1771.txt
@@ -0,0 +1,3195 @@
+
+
+
+
+
+
+Network Working Group Y. Rekhter
+Request for Comments: 1771 T.J. Watson Research Center, IBM Corp.
+Obsoletes: 1654 T. Li
+Category: Standards Track cisco Systems
+ Editors
+ March 1995
+
+
+ A Border Gateway Protocol 4 (BGP-4)
+
+Status of this Memo
+
+ This document specifies an Internet standards track protocol for the
+ Internet community, and requests discussion and suggestions for
+ improvements. Please refer to the current edition of the "Internet
+ Official Protocol Standards" (STD 1) for the standardization state
+ and status of this protocol. Distribution of this memo is unlimited.
+
+Abstract
+
+ This document, together with its companion document, "Application of
+ the Border Gateway Protocol in the Internet", define an inter-
+ autonomous system routing protocol for the Internet.
+
+1. Acknowledgements
+
+ This document was originally published as RFC 1267 in October 1991,
+ jointly authored by Kirk Lougheed (cisco Systems) and Yakov Rekhter
+ (IBM).
+
+ We would like to express our thanks to Guy Almes (ANS), Len Bosack
+ (cisco Systems), and Jeffrey C. Honig (Cornell University) for their
+ contributions to the earlier version of this document.
+
+ We would like to explicitly thank Bob Braden (ISI) for the review of the
+ earlier version of this document as well as his constructive and
+ valuable comments.
+
+ We would also like to thank Bob Hinden, Director for Routing of the
+ Internet Engineering Steering Group, and the team of reviewers he
+ assembled to review the previous version (BGP-2) of this document.
+ This team, consisting of Deborah Estrin, Milo Medin, John Moy, Radia
+ Perlman, Martha Steenstrup, Mike St. Johns, and Paul Tsuchiya, acted
+ with a strong combination of toughness, professionalism, and
+ courtesy.
+
+
+
+
+
+
+Rekhter & Li [Page 1]
+
+RFC 1771 BGP-4 March 1995
+
+
+ This updated version of the document is the product of the IETF IDR
+ Working Group with Yakov Rekhter and Tony Li as editors. Certain
+ sections of the document borrowed heavily from IDRP [7], which is the
+ OSI counterpart of BGP. For this credit should be given to the ANSI
+ X3S3.3 group chaired by Lyman Chapin (BBN) and to Charles Kunzinger
+ (IBM Corp.) who was the IDRP editor within that group. We would also
+ like to thank Mike Craren (Proteon, Inc.), Dimitry Haskin (Bay
+ Networks, Inc.), John Krawczyk (Bay Networks, Inc.), and Paul Traina
+ (cisco Systems) for their insightful comments.
+
+ We would like to specially acknowledge numerous contributions by
+ Dennis Ferguson (MCI).
+
+ The work of Yakov Rekhter was supported in part by the National
+ Science Foundation under Grant Number NCR-9219216.
+
+2. Introduction
+
+ The Border Gateway Protocol (BGP) is an inter-Autonomous System
+ routing protocol. It is built on experience gained with EGP as
+ defined in RFC 904 [1] and EGP usage in the NSFNET Backbone as
+ described in RFC 1092 [2] and RFC 1093 [3].
+
+ The primary function of a BGP speaking system is to exchange network
+ reachability information with other BGP systems. This network
+ reachability information includes information on the list of
+ Autonomous Systems (ASs) that reachability information traverses.
+ This information is sufficient to construct a graph of AS
+ connectivity from which routing loops may be pruned and some policy
+ decisions at the AS level may be enforced.
+
+ BGP-4 provides a new set of mechanisms for supporting classless
+ interdomain routing. These mechanisms include support for
+ advertising an IP prefix and eliminate the concept of network
+ "class" within BGP. BGP-4 also introduces mechanisms which allow
+ aggregation of routes, including aggregation of AS paths. These
+ changes provide support for the proposed supernetting scheme [8, 9].
+
+ To characterize the set of policy decisions that can be enforced
+ using BGP, one must focus on the rule that a BGP speaker advertise to
+ its peers (other BGP speakers which it communicates with) in
+ neighboring ASs only those routes that it itself uses. This rule
+ reflects the "hop-by-hop" routing paradigm generally used throughout
+ the current Internet. Note that some policies cannot be supported by
+ the "hop-by-hop" routing paradigm and thus require techniques such as
+ source routing to enforce. For example, BGP does not enable one AS
+ to send traffic to a neighboring AS intending that the traffic take a
+ different route from that taken by traffic originating in the
+
+
+
+Rekhter & Li [Page 2]
+
+RFC 1771 BGP-4 March 1995
+
+
+ neighboring AS. On the other hand, BGP can support any policy
+ conforming to the "hop-by-hop" routing paradigm. Since the current
+ Internet uses only the "hop-by-hop" routing paradigm and since BGP
+ can support any policy that conforms to that paradigm, BGP is highly
+ applicable as an inter-AS routing protocol for the current Internet.
+
+ A more complete discussion of what policies can and cannot be
+ enforced with BGP is outside the scope of this document (but refer to
+ the companion document discussing BGP usage [5]).
+
+ BGP runs over a reliable transport protocol. This eliminates the
+ need to implement explicit update fragmentation, retransmission,
+ acknowledgement, and sequencing. Any authentication scheme used by
+ the transport protocol may be used in addition to BGP's own
+ authentication mechanisms. The error notification mechanism used in
+ BGP assumes that the transport protocol supports a "graceful" close,
+ i.e., that all outstanding data will be delivered before the
+ connection is closed.
+
+ BGP uses TCP [4] as its transport protocol. TCP meets BGP's
+ transport requirements and is present in virtually all commercial
+ routers and hosts. In the following descriptions the phrase
+ "transport protocol connection" can be understood to refer to a TCP
+ connection. BGP uses TCP port 179 for establishing its connections.
+
+ This document uses the term `Autonomous System' (AS) throughout. The
+ classic definition of an Autonomous System is a set of routers under
+ a single technical administration, using an interior gateway protocol
+ and common metrics to route packets within the AS, and using an
+ exterior gateway protocol to route packets to other ASs. Since this
+ classic definition was developed, it has become common for a single
+ AS to use several interior gateway protocols and sometimes several
+ sets of metrics within an AS. The use of the term Autonomous System
+ here stresses the fact that, even when multiple IGPs and metrics are
+ used, the administration of an AS appears to other ASs to have a
+ single coherent interior routing plan and presents a consistent
+ picture of what destinations are reachable through it.
+
+ The planned use of BGP in the Internet environment, including such
+ issues as topology, the interaction between BGP and IGPs, and the
+ enforcement of routing policy rules is presented in a companion
+ document [5]. This document is the first of a series of documents
+ planned to explore various aspects of BGP application. Please send
+ comments to the BGP mailing list (bgp@ans.net).
+
+
+
+
+
+
+
+Rekhter & Li [Page 3]
+
+RFC 1771 BGP-4 March 1995
+
+
+3. Summary of Operation
+
+ Two systems form a transport protocol connection between one another.
+ They exchange messages to open and confirm the connection parameters.
+ The initial data flow is the entire BGP routing table. Incremental
+ updates are sent as the routing tables change. BGP does not require
+ periodic refresh of the entire BGP routing table. Therefore, a BGP
+ speaker must retain the current version of the entire BGP routing
+ tables of all of its peers for the duration of the connection.
+ KeepAlive messages are sent periodically to ensure the liveness of
+ the connection. Notification messages are sent in response to errors
+ or special conditions. If a connection encounters an error
+ condition, a notification message is sent and the connection is
+ closed.
+
+ The hosts executing the Border Gateway Protocol need not be routers.
+ A non-routing host could exchange routing information with routers
+ via EGP or even an interior routing protocol. That non-routing host
+ could then use BGP to exchange routing information with a border
+ router in another Autonomous System. The implications and
+ applications of this architecture are for further study.
+
+ If a particular AS has multiple BGP speakers and is providing transit
+ service for other ASs, then care must be taken to ensure a consistent
+ view of routing within the AS. A consistent view of the interior
+ routes of the AS is provided by the interior routing protocol. A
+ consistent view of the routes exterior to the AS can be provided by
+ having all BGP speakers within the AS maintain direct BGP connections
+ with each other. Using a common set of policies, the BGP speakers
+ arrive at an agreement as to which border routers will serve as
+ exit/entry points for particular destinations outside the AS. This
+ information is communicated to the AS's internal routers, possibly
+ via the interior routing protocol. Care must be taken to ensure that
+ the interior routers have all been updated with transit information
+ before the BGP speakers announce to other ASs that transit service is
+ being provided.
+
+ Connections between BGP speakers of different ASs are referred to as
+ "external" links. BGP connections between BGP speakers within the
+ same AS are referred to as "internal" links. Similarly, a peer in a
+ different AS is referred to as an external peer, while a peer in the
+ same AS may be described as an internal peer.
+
+
+
+
+
+
+
+
+
+Rekhter & Li [Page 4]
+
+RFC 1771 BGP-4 March 1995
+
+
+3.1 Routes: Advertisement and Storage
+
+ For purposes of this protocol a route is defined as a unit of
+ information that pairs a destination with the attributes of a path to
+ that destination:
+
+ - Routes are advertised between a pair of BGP speakers in UPDATE
+ messages: the destination is the systems whose IP addresses are
+ reported in the Network Layer Reachability Information (NLRI)
+ field, and the path is the information reported in the path
+ attributes fields of the same UPDATE message.
+
+ - Routes are stored in the Routing Information Bases (RIBs):
+ namely, the Adj-RIBs-In, the Loc-RIB, and the Adj-RIBs-Out. Routes
+ that will be advertised to other BGP speakers must be present in
+ the Adj-RIB-Out; routes that will be used by the local BGP speaker
+ must be present in the Loc-RIB, and the next hop for each of these
+ routes must be present in the local BGP speaker's forwarding
+ information base; and routes that are received from other BGP
+ speakers are present in the Adj-RIBs-In.
+
+ If a BGP speaker chooses to advertise the route, it may add to or
+ modify the path attributes of the route before advertising it to a
+ peer.
+
+ BGP provides mechanisms by which a BGP speaker can inform its peer
+ that a previously advertised route is no longer available for use.
+ There are three methods by which a given BGP speaker can indicate
+ that a route has been withdrawn from service:
+
+ a) the IP prefix that expresses destinations for a previously
+ advertised route can be advertised in the WITHDRAWN ROUTES field
+ in the UPDATE message, thus marking the associated route as being
+ no longer available for use
+
+ b) a replacement route with the same Network Layer Reachability
+ Information can be advertised, or
+
+ c) the BGP speaker - BGP speaker connection can be closed, which
+ implicitly removes from service all routes which the pair of
+ speakers had advertised to each other.
+
+
+
+
+
+
+
+
+
+
+Rekhter & Li [Page 5]
+
+RFC 1771 BGP-4 March 1995
+
+
+3.2 Routing Information Bases
+
+ The Routing Information Base (RIB) within a BGP speaker consists of
+ three distinct parts:
+
+ a) Adj-RIBs-In: The Adj-RIBs-In store routing information that has
+ been learned from inbound UPDATE messages. Their contents
+ represent routes that are available as an input to the Decision
+ Process.
+
+ b) Loc-RIB: The Loc-RIB contains the local routing information
+ that the BGP speaker has selected by applying its local policies
+ to the routing information contained in its Adj-RIBs-In.
+
+ c) Adj-RIBs-Out: The Adj-RIBs-Out store the information that the
+ local BGP speaker has selected for advertisement to its peers. The
+ routing information stored in the Adj-RIBs-Out will be carried in
+ the local BGP speaker's UPDATE messages and advertised to its
+ peers.
+
+ In summary, the Adj-RIBs-In contain unprocessed routing information
+ that has been advertised to the local BGP speaker by its peers; the
+ Loc-RIB contains the routes that have been selected by the local BGP
+ speaker's Decision Process; and the Adj-RIBs-Out organize the routes
+ for advertisement to specific peers by means of the local speaker's
+ UPDATE messages.
+
+ Although the conceptual model distinguishes between Adj-RIBs-In,
+ Loc-RIB, and Adj-RIBs-Out, this neither implies nor requires that an
+ implementation must maintain three separate copies of the routing
+ information. The choice of implementation (for example, 3 copies of
+ the information vs 1 copy with pointers) is not constrained by the
+ protocol.
+
+4. Message Formats
+
+ This section describes message formats used by BGP.
+
+ Messages are sent over a reliable transport protocol connection. A
+ message is processed only after it is entirely received. The maximum
+ message size is 4096 octets. All implementations are required to
+ support this maximum message size. The smallest message that may be
+ sent consists of a BGP header without a data portion, or 19 octets.
+
+
+
+
+
+
+
+
+Rekhter & Li [Page 6]
+
+RFC 1771 BGP-4 March 1995
+
+
+4.1 Message Header Format
+
+ Each message has a fixed-size header. There may or may not be a data
+ portion following the header, depending on the message type. The
+ layout of these fields is shown below:
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ + +
+ | |
+ + +
+ | Marker |
+ + +
+ | |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Length | Type |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ Marker:
+
+ This 16-octet field contains a value that the receiver of the
+ message can predict. If the Type of the message is OPEN, or if
+ the OPEN message carries no Authentication Information (as an
+ Optional Parameter), then the Marker must be all ones.
+ Otherwise, the value of the marker can be predicted by some
+ computation specified as part of the authentication mechanism
+ (which is specified as part of the Authentication Information)
+ used. The Marker can be used to detect loss of synchronization
+ between a pair of BGP peers, and to authenticate incoming BGP
+ messages.
+
+ Length:
+
+ This 2-octet unsigned integer indicates the total length of the
+ message, including the header, in octets. Thus, e.g., it
+ allows one to locate in the transport-level stream the (Marker
+ field of the) next message. The value of the Length field must
+ always be at least 19 and no greater than 4096, and may be
+ further constrained, depending on the message type. No
+ "padding" of extra data after the message is allowed, so the
+ Length field must have the smallest value required given the
+ rest of the message.
+
+
+
+
+
+
+
+Rekhter & Li [Page 7]
+
+RFC 1771 BGP-4 March 1995
+
+
+ Type:
+
+ This 1-octet unsigned integer indicates the type code of the
+ message. The following type codes are defined:
+
+ 1 - OPEN
+ 2 - UPDATE
+ 3 - NOTIFICATION
+ 4 - KEEPALIVE
+
+4.2 OPEN Message Format
+
+ After a transport protocol connection is established, the first
+ message sent by each side is an OPEN message. If the OPEN message is
+ acceptable, a KEEPALIVE message confirming the OPEN is sent back.
+ Once the OPEN is confirmed, UPDATE, KEEPALIVE, and NOTIFICATION
+ messages may be exchanged.
+
+ In addition to the fixed-size BGP header, the OPEN message contains
+ the following fields:
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+
+ | Version |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | My Autonomous System |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Hold Time |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | BGP Identifier |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Opt Parm Len |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | Optional Parameters |
+ | |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ Version:
+
+ This 1-octet unsigned integer indicates the protocol version
+ number of the message. The current BGP version number is 4.
+
+ My Autonomous System:
+
+ This 2-octet unsigned integer indicates the Autonomous System
+ number of the sender.
+
+
+
+Rekhter & Li [Page 8]
+
+RFC 1771 BGP-4 March 1995
+
+
+ Hold Time:
+
+ This 2-octet unsigned integer indicates the number of seconds
+ that the sender proposes for the value of the Hold Timer. Upon
+ receipt of an OPEN message, a BGP speaker MUST calculate the
+ value of the Hold Timer by using the smaller of its configured
+ Hold Time and the Hold Time received in the OPEN message. The
+ Hold Time MUST be either zero or at least three seconds. An
+ implementation may reject connections on the basis of the Hold
+ Time. The calculated value indicates the maximum number of
+ seconds that may elapse between the receipt of successive
+ KEEPALIVE, and/or UPDATE messages by the sender.
+
+ BGP Identifier:
+
+ This 4-octet unsigned integer indicates the BGP Identifier of
+ the sender. A given BGP speaker sets the value of its BGP
+ Identifier to an IP address assigned to that BGP speaker. The
+ value of the BGP Identifier is determined on startup and is the
+ same for every local interface and every BGP peer.
+
+ Optional Parameters Length:
+
+ This 1-octet unsigned integer indicates the total length of the
+ Optional Parameters field in octets. If the value of this field
+ is zero, no Optional Parameters are present.
+
+ Optional Parameters:
+
+ This field may contain a list of optional parameters, where
+ each parameter is encoded as a <Parameter Type, Parameter
+ Length, Parameter Value> triplet.
+
+ 0 1
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-...
+ | Parm. Type | Parm. Length | Parameter Value (variable)
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-...
+
+ Parameter Type is a one octet field that unambiguously
+ identifies individual parameters. Parameter Length is a one
+ octet field that contains the length of the Parameter Value
+ field in octets. Parameter Value is a variable length field
+ that is interpreted according to the value of the Parameter
+ Type field.
+
+
+
+
+
+
+Rekhter & Li [Page 9]
+
+RFC 1771 BGP-4 March 1995
+
+
+ This document defines the following Optional Parameters:
+
+ a) Authentication Information (Parameter Type 1):
+
+ This optional parameter may be used to authenticate a BGP
+ peer. The Parameter Value field contains a 1-octet
+ Authentication Code followed by a variable length
+ Authentication Data.
+
+ 0 1 2 3 4 5 6 7 8
+ +-+-+-+-+-+-+-+-+
+ | Auth. Code |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | |
+ | Authentication Data |
+ | |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ Authentication Code:
+
+ This 1-octet unsigned integer indicates the
+ authentication mechanism being used. Whenever an
+ authentication mechanism is specified for use within
+ BGP, three things must be included in the
+ specification:
+
+ - the value of the Authentication Code which indicates
+ use of the mechanism,
+ - the form and meaning of the Authentication Data, and
+ - the algorithm for computing values of Marker fields.
+
+ Note that a separate authentication mechanism may be
+ used in establishing the transport level connection.
+
+ Authentication Data:
+
+ This is a variable-length field whose form and meaning
+ depend on the Authentication Code.
+
+ The minimum length of the OPEN message is 29 octets (including
+ message header).
+
+
+
+
+
+
+
+
+
+
+Rekhter & Li [Page 10]
+
+RFC 1771 BGP-4 March 1995
+
+
+4.3 UPDATE Message Format
+
+ UPDATE messages are used to transfer routing information between BGP
+ peers. The information in the UPDATE packet can be used to construct
+ a graph describing the relationships of the various Autonomous
+ Systems. By applying rules to be discussed, routing information
+ loops and some other anomalies may be detected and removed from
+ inter-AS routing.
+
+ An UPDATE message is used to advertise a single feasible route to a
+ peer, or to withdraw multiple unfeasible routes from service (see
+ 3.1). An UPDATE message may simultaneously advertise a feasible route
+ and withdraw multiple unfeasible routes from service. The UPDATE
+ message always includes the fixed-size BGP header, and can optionally
+ include the other fields as shown below:
+
+ +-----------------------------------------------------+
+ | Unfeasible Routes Length (2 octets) |
+ +-----------------------------------------------------+
+ | Withdrawn Routes (variable) |
+ +-----------------------------------------------------+
+ | Total Path Attribute Length (2 octets) |
+ +-----------------------------------------------------+
+ | Path Attributes (variable) |
+ +-----------------------------------------------------+
+ | Network Layer Reachability Information (variable) |
+ +-----------------------------------------------------+
+
+ Unfeasible Routes Length:
+
+ This 2-octet unsigned integer indicates the total length of
+ the Withdrawn Routes field in octets. Its value must allow the
+ length of the Network Layer Reachability Information field to
+ be determined as specified below.
+
+ A value of 0 indicates that no routes are being withdrawn from
+ service, and that the WITHDRAWN ROUTES field is not present in
+ this UPDATE message.
+
+ Withdrawn Routes:
+
+ This is a variable length field that contains a list of IP
+ address prefixes for the routes that are being withdrawn from
+ service. Each IP address prefix is encoded as a 2-tuple of the
+ form <length, prefix>, whose fields are described below:
+
+
+
+
+
+
+Rekhter & Li [Page 11]
+
+RFC 1771 BGP-4 March 1995
+
+
+ +---------------------------+
+ | Length (1 octet) |
+ +---------------------------+
+ | Prefix (variable) |
+ +---------------------------+
+
+ The use and the meaning of these fields are as follows:
+
+ a) Length:
+
+ The Length field indicates the length in bits of the IP
+ address prefix. A length of zero indicates a prefix that
+ matches all IP addresses (with prefix, itself, of zero
+ octets).
+
+ b) Prefix:
+
+ The Prefix field contains IP address prefixes followed by
+ enough trailing bits to make the end of the field fall on an
+ octet boundary. Note that the value of trailing bits is
+ irrelevant.
+
+ Total Path Attribute Length:
+
+ This 2-octet unsigned integer indicates the total length of the
+ Path Attributes field in octets. Its value must allow the
+ length of the Network Layer Reachability field to be determined
+ as specified below.
+
+ A value of 0 indicates that no Network Layer Reachability
+ Information field is present in this UPDATE message.
+
+ Path Attributes:
+
+ A variable length sequence of path attributes is present in
+ every UPDATE. Each path attribute is a triple <attribute type,
+ attribute length, attribute value> of variable length.
+
+ Attribute Type is a two-octet field that consists of the
+ Attribute Flags octet followed by the Attribute Type Code
+ octet.
+
+ 0 1
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Attr. Flags |Attr. Type Code|
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+
+
+
+Rekhter & Li [Page 12]
+
+RFC 1771 BGP-4 March 1995
+
+
+ The high-order bit (bit 0) of the Attribute Flags octet is the
+ Optional bit. It defines whether the attribute is optional (if
+ set to 1) or well-known (if set to 0).
+
+ The second high-order bit (bit 1) of the Attribute Flags octet
+ is the Transitive bit. It defines whether an optional
+ attribute is transitive (if set to 1) or non-transitive (if set
+ to 0). For well-known attributes, the Transitive bit must be
+ set to 1. (See Section 5 for a discussion of transitive
+ attributes.)
+
+ The third high-order bit (bit 2) of the Attribute Flags octet
+ is the Partial bit. It defines whether the information
+ contained in the optional transitive attribute is partial (if
+ set to 1) or complete (if set to 0). For well-known attributes
+ and for optional non-transitive attributes the Partial bit must
+ be set to 0.
+
+ The fourth high-order bit (bit 3) of the Attribute Flags octet
+ is the Extended Length bit. It defines whether the Attribute
+ Length is one octet (if set to 0) or two octets (if set to 1).
+ Extended Length may be used only if the length of the attribute
+ value is greater than 255 octets.
+
+ The lower-order four bits of the Attribute Flags octet are
+ unused. They must be zero (and must be ignored when received).
+
+ The Attribute Type Code octet contains the Attribute Type Code.
+ Currently defined Attribute Type Codes are discussed in Section
+ 5.
+
+ If the Extended Length bit of the Attribute Flags octet is set
+ to 0, the third octet of the Path Attribute contains the length
+ of the attribute data in octets.
+
+ If the Extended Length bit of the Attribute Flags octet is set
+ to 1, then the third and the fourth octets of the path
+ attribute contain the length of the attribute data in octets.
+
+ The remaining octets of the Path Attribute represent the
+ attribute value and are interpreted according to the Attribute
+ Flags and the Attribute Type Code. The supported Attribute Type
+ Codes, their attribute values and uses are the following:
+
+
+
+
+
+
+
+
+Rekhter & Li [Page 13]
+
+RFC 1771 BGP-4 March 1995
+
+
+ a) ORIGIN (Type Code 1):
+
+ ORIGIN is a well-known mandatory attribute that defines the
+ origin of the path information. The data octet can assume
+ the following values:
+
+ Value Meaning
+
+ 0 IGP - Network Layer Reachability Information
+ is interior to the originating AS
+
+ 1 EGP - Network Layer Reachability Information
+ learned via EGP
+
+ 2 INCOMPLETE - Network Layer Reachability
+ Information learned by some other means
+
+ Its usage is defined in 5.1.1
+
+ b) AS_PATH (Type Code 2):
+
+ AS_PATH is a well-known mandatory attribute that is composed
+ of a sequence of AS path segments. Each AS path segment is
+ represented by a triple <path segment type, path segment
+ length, path segment value>.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Rekhter & Li [Page 14]
+
+RFC 1771 BGP-4 March 1995
+
+
+ The path segment type is a 1-octet long field with the
+ following values defined:
+
+ Value Segment Type
+
+ 1 AS_SET: unordered set of ASs a route in the
+ UPDATE message has traversed
+
+ 2 AS_SEQUENCE: ordered set of ASs a route in
+ the UPDATE message has traversed
+
+ The path segment length is a 1-octet long field containing
+ the number of ASs in the path segment value field.
+
+ The path segment value field contains one or more AS
+ numbers, each encoded as a 2-octets long field.
+
+ Usage of this attribute is defined in 5.1.2.
+
+ c) NEXT_HOP (Type Code 3):
+
+ This is a well-known mandatory attribute that defines the IP
+ address of the border router that should be used as the next
+ hop to the destinations listed in the Network Layer
+ Reachability field of the UPDATE message.
+
+ Usage of this attribute is defined in 5.1.3.
+
+ d) MULTI_EXIT_DISC (Type Code 4):
+
+ This is an optional non-transitive attribute that is a four
+ octet non-negative integer. The value of this attribute may
+ be used by a BGP speaker's decision process to discriminate
+ among multiple exit points to a neighboring autonomous
+ system.
+
+ Its usage is defined in 5.1.4.
+
+ e) LOCAL_PREF (Type Code 5):
+
+ LOCAL_PREF is a well-known discretionary attribute that is a
+ four octet non-negative integer. It is used by a BGP speaker
+ to inform other BGP speakers in its own autonomous system of
+ the originating speaker's degree of preference for an
+ advertised route. Usage of this attribute is described in
+ 5.1.5.
+
+
+
+
+
+Rekhter & Li [Page 15]
+
+RFC 1771 BGP-4 March 1995
+
+
+ f) ATOMIC_AGGREGATE (Type Code 6)
+
+ ATOMIC_AGGREGATE is a well-known discretionary attribute of
+ length 0. It is used by a BGP speaker to inform other BGP
+ speakers that the local system selected a less specific
+ route without selecting a more specific route which is
+ included in it. Usage of this attribute is described in
+ 5.1.6.
+
+ g) AGGREGATOR (Type Code 7)
+
+ AGGREGATOR is an optional transitive attribute of length 6.
+ The attribute contains the last AS number that formed the
+ aggregate route (encoded as 2 octets), followed by the IP
+ address of the BGP speaker that formed the aggregate route
+ (encoded as 4 octets). Usage of this attribute is described
+ in 5.1.7.
+
+ Network Layer Reachability Information:
+
+ This variable length field contains a list of IP address
+ prefixes. The length in octets of the Network Layer
+ Reachability Information is not encoded explicitly, but can be
+ calculated as:
+
+ UPDATE message Length - 23 - Total Path Attributes Length -
+ Unfeasible Routes Length
+
+ where UPDATE message Length is the value encoded in the fixed-
+ size BGP header, Total Path Attribute Length and Unfeasible
+ Routes Length are the values encoded in the variable part of
+ the UPDATE message, and 23 is a combined length of the fixed-
+ size BGP header, the Total Path Attribute Length field and the
+ Unfeasible Routes Length field.
+
+ Reachability information is encoded as one or more 2-tuples of
+ the form <length, prefix>, whose fields are described below:
+
+ +---------------------------+
+ | Length (1 octet) |
+ +---------------------------+
+ | Prefix (variable) |
+ +---------------------------+
+
+
+
+
+
+
+
+
+Rekhter & Li [Page 16]
+
+RFC 1771 BGP-4 March 1995
+
+
+ The use and the meaning of these fields are as follows:
+
+ a) Length:
+
+ The Length field indicates the length in bits of the IP
+ address prefix. A length of zero indicates a prefix that
+ matches all IP addresses (with prefix, itself, of zero
+ octets).
+
+ b) Prefix:
+
+ The Prefix field contains IP address prefixes followed by
+ enough trailing bits to make the end of the field fall on an
+ octet boundary. Note that the value of the trailing bits is
+ irrelevant.
+
+ The minimum length of the UPDATE message is 23 octets -- 19 octets
+ for the fixed header + 2 octets for the Unfeasible Routes Length + 2
+ octets for the Total Path Attribute Length (the value of Unfeasible
+ Routes Length is 0 and the value of Total Path Attribute Length is
+ 0).
+
+ An UPDATE message can advertise at most one route, which may be
+ described by several path attributes. All path attributes contained
+ in a given UPDATE message apply to the destinations carried in the
+ Network Layer Reachability Information field of the UPDATE message.
+
+ An UPDATE message can list multiple routes to be withdrawn from
+ service. Each such route is identified by its destination (expressed
+ as an IP prefix), which unambiguously identifies the route in the
+ context of the BGP speaker - BGP speaker connection to which it has
+ previously been advertised.
+
+ An UPDATE message may advertise only routes to be withdrawn from
+ service, in which case it will not include path attributes or Network
+ Layer Reachability Information. Conversely, it may advertise only a
+ feasible route, in which case the WITHDRAWN ROUTES field need not be
+ present.
+
+4.4 KEEPALIVE Message Format
+
+ BGP does not use any transport protocol-based keep-alive mechanism to
+ determine if peers are reachable. Instead, KEEPALIVE messages are
+ exchanged between peers often enough as not to cause the Hold Timer
+ to expire. A reasonable maximum time between KEEPALIVE messages
+ would be one third of the Hold Time interval. KEEPALIVE messages
+ MUST NOT be sent more frequently than one per second. An
+ implementation MAY adjust the rate at which it sends KEEPALIVE
+
+
+
+Rekhter & Li [Page 17]
+
+RFC 1771 BGP-4 March 1995
+
+
+ messages as a function of the Hold Time interval.
+
+ If the negotiated Hold Time interval is zero, then periodic KEEPALIVE
+ messages MUST NOT be sent.
+
+ KEEPALIVE message consists of only message header and has a length of
+ 19 octets.
+
+4.5 NOTIFICATION Message Format
+
+ A NOTIFICATION message is sent when an error condition is detected.
+ The BGP connection is closed immediately after sending it.
+
+ In addition to the fixed-size BGP header, the NOTIFICATION message
+ contains the following fields:
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Error code | Error subcode | Data |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ +
+ | |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ Error Code:
+
+ This 1-octet unsigned integer indicates the type of
+ NOTIFICATION. The following Error Codes have been defined:
+
+ Error Code Symbolic Name Reference
+
+ 1 Message Header Error Section 6.1
+
+ 2 OPEN Message Error Section 6.2
+
+ 3 UPDATE Message Error Section 6.3
+
+ 4 Hold Timer Expired Section 6.5
+
+ 5 Finite State Machine Error Section 6.6
+
+ 6 Cease Section 6.7
+
+ Error subcode:
+
+ This 1-octet unsigned integer provides more specific
+ information about the nature of the reported error. Each Error
+ Code may have one or more Error Subcodes associated with it.
+
+
+
+Rekhter & Li [Page 18]
+
+RFC 1771 BGP-4 March 1995
+
+
+ If no appropriate Error Subcode is defined, then a zero
+ (Unspecific) value is used for the Error Subcode field.
+
+ Message Header Error subcodes:
+
+ 1 - Connection Not Synchronized.
+ 2 - Bad Message Length.
+ 3 - Bad Message Type.
+
+ OPEN Message Error subcodes:
+
+ 1 - Unsupported Version Number.
+ 2 - Bad Peer AS.
+ 3 - Bad BGP Identifier.
+ 4 - Unsupported Optional Parameter.
+ 5 - Authentication Failure.
+ 6 - Unacceptable Hold Time.
+
+ UPDATE Message Error subcodes:
+
+ 1 - Malformed Attribute List.
+ 2 - Unrecognized Well-known Attribute.
+ 3 - Missing Well-known Attribute.
+ 4 - Attribute Flags Error.
+ 5 - Attribute Length Error.
+ 6 - Invalid ORIGIN Attribute.
+ 7 - AS Routing Loop.
+ 8 - Invalid NEXT_HOP Attribute.
+ 9 - Optional Attribute Error.
+ 10 - Invalid Network Field.
+ 11 - Malformed AS_PATH.
+
+ Data:
+
+ This variable-length field is used to diagnose the reason for
+ the NOTIFICATION. The contents of the Data field depend upon
+ the Error Code and Error Subcode. See Section 6 below for more
+ details.
+
+ Note that the length of the Data field can be determined from
+ the message Length field by the formula:
+
+ Message Length = 21 + Data Length
+
+ The minimum length of the NOTIFICATION message is 21 octets
+ (including message header).
+
+
+
+
+
+Rekhter & Li [Page 19]
+
+RFC 1771 BGP-4 March 1995
+
+
+5. Path Attributes
+
+ This section discusses the path attributes of the UPDATE message.
+
+ Path attributes fall into four separate categories:
+
+ 1. Well-known mandatory.
+ 2. Well-known discretionary.
+ 3. Optional transitive.
+ 4. Optional non-transitive.
+
+ Well-known attributes must be recognized by all BGP implementations.
+ Some of these attributes are mandatory and must be included in every
+ UPDATE message. Others are discretionary and may or may not be sent
+ in a particular UPDATE message.
+
+ All well-known attributes must be passed along (after proper
+ updating, if necessary) to other BGP peers.
+
+ In addition to well-known attributes, each path may contain one or
+ more optional attributes. It is not required or expected that all
+ BGP implementations support all optional attributes. The handling of
+ an unrecognized optional attribute is determined by the setting of
+ the Transitive bit in the attribute flags octet. Paths with
+ unrecognized transitive optional attributes should be accepted. If a
+ path with unrecognized transitive optional attribute is accepted and
+ passed along to other BGP peers, then the unrecognized transitive
+ optional attribute of that path must be passed along with the path to
+ other BGP peers with the Partial bit in the Attribute Flags octet set
+ to 1. If a path with recognized transitive optional attribute is
+ accepted and passed along to other BGP peers and the Partial bit in
+ the Attribute Flags octet is set to 1 by some previous AS, it is not
+ set back to 0 by the current AS. Unrecognized non-transitive optional
+ attributes must be quietly ignored and not passed along to other BGP
+ peers.
+
+ New transitive optional attributes may be attached to the path by the
+ originator or by any other AS in the path. If they are not attached
+ by the originator, the Partial bit in the Attribute Flags octet is
+ set to 1. The rules for attaching new non-transitive optional
+ attributes will depend on the nature of the specific attribute. The
+ documentation of each new non-transitive optional attribute will be
+ expected to include such rules. (The description of the
+ MULTI_EXIT_DISC attribute gives an example.) All optional attributes
+ (both transitive and non-transitive) may be updated (if appropriate)
+ by ASs in the path.
+
+
+
+
+
+Rekhter & Li [Page 20]
+
+RFC 1771 BGP-4 March 1995
+
+
+ The sender of an UPDATE message should order path attributes within
+ the UPDATE message in ascending order of attribute type. The
+ receiver of an UPDATE message must be prepared to handle path
+ attributes within the UPDATE message that are out of order.
+
+ The same attribute cannot appear more than once within the Path
+ Attributes field of a particular UPDATE message.
+
+5.1 Path Attribute Usage
+
+ The usage of each BGP path attribute is described in the following
+ clauses.
+
+5.1.1 ORIGIN
+
+ ORIGIN is a well-known mandatory attribute. The ORIGIN attribute
+ shall be generated by the autonomous system that originates the
+ associated routing information. It shall be included in the UPDATE
+ messages of all BGP speakers that choose to propagate this
+ information to other BGP speakers.
+
+5.1.2 AS_PATH
+
+ AS_PATH is a well-known mandatory attribute. This attribute
+ identifies the autonomous systems through which routing information
+ carried in this UPDATE message has passed. The components of this
+ list can be AS_SETs or AS_SEQUENCEs.
+
+ When a BGP speaker propagates a route which it has learned from
+ another BGP speaker's UPDATE message, it shall modify the route's
+ AS_PATH attribute based on the location of the BGP speaker to which
+ the route will be sent:
+
+ a) When a given BGP speaker advertises the route to another BGP
+ speaker located in its own autonomous system, the advertising
+ speaker shall not modify the AS_PATH attribute associated with the
+ route.
+
+ b) When a given BGP speaker advertises the route to a BGP speaker
+ located in a neighboring autonomous system, then the advertising
+ speaker shall update the AS_PATH attribute as follows:
+
+ 1) if the first path segment of the AS_PATH is of type
+ AS_SEQUENCE, the local system shall prepend its own AS number
+ as the last element of the sequence (put it in the leftmost
+ position).
+
+
+
+
+
+Rekhter & Li [Page 21]
+
+RFC 1771 BGP-4 March 1995
+
+
+ 2) if the first path segment of the AS_PATH is of type AS_SET,
+ the local system shall prepend a new path segment of type
+ AS_SEQUENCE to the AS_PATH, including its own AS number in that
+ segment.
+
+ When a BGP speaker originates a route then:
+
+ a) the originating speaker shall include its own AS number in
+ the AS_PATH attribute of all UPDATE messages sent to BGP
+ speakers located in neighboring autonomous systems. (In this
+ case, the AS number of the originating speaker's autonomous
+ system will be the only entry in the AS_PATH attribute).
+
+ b) the originating speaker shall include an empty AS_PATH
+ attribute in all UPDATE messages sent to BGP speakers located
+ in its own autonomous system. (An empty AS_PATH attribute is
+ one whose length field contains the value zero).
+
+5.1.3 NEXT_HOP
+
+ The NEXT_HOP path attribute defines the IP address of the border
+ router that should be used as the next hop to the destinations listed
+ in the UPDATE message. If a border router belongs to the same AS as
+ its peer, then the peer is an internal border router. Otherwise, it
+ is an external border router. A BGP speaker can advertise any
+ internal border router as the next hop provided that the interface
+ associated with the IP address of this border router (as specified in
+ the NEXT_HOP path attribute) shares a common subnet with both the
+ local and remote BGP speakers. A BGP speaker can advertise any
+ external border router as the next hop, provided that the IP address
+ of this border router was learned from one of the BGP speaker's
+ peers, and the interface associated with the IP address of this
+ border router (as specified in the NEXT_HOP path attribute) shares a
+ common subnet with the local and remote BGP speakers. A BGP speaker
+ needs to be able to support disabling advertisement of external
+ border routers.
+
+ A BGP speaker must never advertise an address of a peer to that peer
+ as a NEXT_HOP, for a route that the speaker is originating. A BGP
+ speaker must never install a route with itself as the next hop.
+
+ When a BGP speaker advertises the route to a BGP speaker located in
+ its own autonomous system, the advertising speaker shall not modify
+ the NEXT_HOP attribute associated with the route. When a BGP speaker
+ receives the route via an internal link, it may forward packets to
+ the NEXT_HOP address if the address contained in the attribute is on
+ a common subnet with the local and remote BGP speakers.
+
+
+
+
+Rekhter & Li [Page 22]
+
+RFC 1771 BGP-4 March 1995
+
+
+5.1.4 MULTI_EXIT_DISC
+
+ The MULTI_EXIT_DISC attribute may be used on external (inter-AS)
+ links to discriminate among multiple exit or entry points to the same
+ neighboring AS. The value of the MULTI_EXIT_DISC attribute is a four
+ octet unsigned number which is called a metric. All other factors
+ being equal, the exit or entry point with lower metric should be
+ preferred. If received over external links, the MULTI_EXIT_DISC
+ attribute may be propagated over internal links to other BGP speakers
+ within the same AS. The MULTI_EXIT_DISC attribute is never
+ propagated to other BGP speakers in neighboring AS's.
+
+5.1.5 LOCAL_PREF
+
+ LOCAL_PREF is a well-known discretionary attribute that shall be
+ included in all UPDATE messages that a given BGP speaker sends to the
+ other BGP speakers located in its own autonomous system. A BGP
+ speaker shall calculate the degree of preference for each external
+ route and include the degree of preference when advertising a route
+ to its internal peers. The higher degree of preference should be
+ preferred. A BGP speaker shall use the degree of preference learned
+ via LOCAL_PREF in its decision process (see section 9.1.1).
+
+ A BGP speaker shall not include this attribute in UPDATE messages
+ that it sends to BGP speakers located in a neighboring autonomous
+ system. If it is contained in an UPDATE message that is received from
+ a BGP speaker which is not located in the same autonomous system as
+ the receiving speaker, then this attribute shall be ignored by the
+ receiving speaker.
+
+5.1.6 ATOMIC_AGGREGATE
+
+ ATOMIC_AGGREGATE is a well-known discretionary attribute. If a BGP
+ speaker, when presented with a set of overlapping routes from one of
+ its peers (see 9.1.4), selects the less specific route without
+ selecting the more specific one, then the local system shall attach
+ the ATOMIC_AGGREGATE attribute to the route when propagating it to
+ other BGP speakers (if that attribute is not already present in the
+ received less specific route). A BGP speaker that receives a route
+ with the ATOMIC_AGGREGATE attribute shall not remove the attribute
+ from the route when propagating it to other speakers. A BGP speaker
+ that receives a route with the ATOMIC_AGGREGATE attribute shall not
+ make any NLRI of that route more specific (as defined in 9.1.4) when
+ advertising this route to other BGP speakers. A BGP speaker that
+ receives a route with the ATOMIC_AGGREGATE attribute needs to be
+ cognizant of the fact that the actual path to destinations, as
+ specified in the NLRI of the route, while having the loop-free
+ property, may traverse ASs that are not listed in the AS_PATH
+
+
+
+Rekhter & Li [Page 23]
+
+RFC 1771 BGP-4 March 1995
+
+
+ attribute.
+
+5.1.7 AGGREGATOR
+
+ AGGREGATOR is an optional transitive attribute which may be included
+ in updates which are formed by aggregation (see Section 9.2.4.2). A
+ BGP speaker which performs route aggregation may add the AGGREGATOR
+ attribute which shall contain its own AS number and IP address.
+
+6. BGP Error Handling.
+
+ This section describes actions to be taken when errors are detected
+ while processing BGP messages.
+
+ When any of the conditions described here are detected, a
+ NOTIFICATION message with the indicated Error Code, Error Subcode,
+ and Data fields is sent, and the BGP connection is closed. If no
+ Error Subcode is specified, then a zero must be used.
+
+ The phrase "the BGP connection is closed" means that the transport
+ protocol connection has been closed and that all resources for that
+ BGP connection have been deallocated. Routing table entries
+ associated with the remote peer are marked as invalid. The fact that
+ the routes have become invalid is passed to other BGP peers before
+ the routes are deleted from the system.
+
+ Unless specified explicitly, the Data field of the NOTIFICATION
+ message that is sent to indicate an error is empty.
+
+6.1 Message Header error handling.
+
+ All errors detected while processing the Message Header are indicated
+ by sending the NOTIFICATION message with Error Code Message Header
+ Error. The Error Subcode elaborates on the specific nature of the
+ error.
+
+ The expected value of the Marker field of the message header is all
+ ones if the message type is OPEN. The expected value of the Marker
+ field for all other types of BGP messages is determined based on the
+ presence of the Authentication Information Optional Parameter in the
+ BGP OPEN message and the actual authentication mechanism (if the
+ Authentication Information in the BGP OPEN message is present). If
+ the Marker field of the message header is not the expected one, then
+ a synchronization error has occurred and the Error Subcode is set to
+ Connection Not Synchronized.
+
+
+
+
+
+
+Rekhter & Li [Page 24]
+
+RFC 1771 BGP-4 March 1995
+
+
+ If the Length field of the message header is less than 19 or greater
+ than 4096, or if the Length field of an OPEN message is less than
+ the minimum length of the OPEN message, or if the Length field of an
+ UPDATE message is less than the minimum length of the UPDATE message,
+ or if the Length field of a KEEPALIVE message is not equal to 19, or
+ if the Length field of a NOTIFICATION message is less than the
+ minimum length of the NOTIFICATION message, then the Error Subcode is
+ set to Bad Message Length. The Data field contains the erroneous
+ Length field.
+
+ If the Type field of the message header is not recognized, then the
+ Error Subcode is set to Bad Message Type. The Data field contains
+ the erroneous Type field.
+
+6.2 OPEN message error handling.
+
+ All errors detected while processing the OPEN message are indicated
+ by sending the NOTIFICATION message with Error Code OPEN Message
+ Error. The Error Subcode elaborates on the specific nature of the
+ error.
+
+ If the version number contained in the Version field of the received
+ OPEN message is not supported, then the Error Subcode is set to
+ Unsupported Version Number. The Data field is a 2-octet unsigned
+ integer, which indicates the largest locally supported version number
+ less than the version the remote BGP peer bid (as indicated in the
+ received OPEN message).
+
+ If the Autonomous System field of the OPEN message is unacceptable,
+ then the Error Subcode is set to Bad Peer AS. The determination of
+ acceptable Autonomous System numbers is outside the scope of this
+ protocol.
+
+ If the Hold Time field of the OPEN message is unacceptable, then the
+ Error Subcode MUST be set to Unacceptable Hold Time. An
+ implementation MUST reject Hold Time values of one or two seconds.
+ An implementation MAY reject any proposed Hold Time. An
+ implementation which accepts a Hold Time MUST use the negotiated
+ value for the Hold Time.
+
+ If the BGP Identifier field of the OPEN message is syntactically
+ incorrect, then the Error Subcode is set to Bad BGP Identifier.
+ Syntactic correctness means that the BGP Identifier field represents
+ a valid IP host address.
+
+ If one of the Optional Parameters in the OPEN message is not
+ recognized, then the Error Subcode is set to Unsupported Optional
+ Parameter.
+
+
+
+Rekhter & Li [Page 25]
+
+RFC 1771 BGP-4 March 1995
+
+
+ If the OPEN message carries Authentication Information (as an
+ Optional Parameter), then the corresponding authentication procedure
+ is invoked. If the authentication procedure (based on Authentication
+ Code and Authentication Data) fails, then the Error Subcode is set to
+ Authentication Failure.
+
+6.3 UPDATE message error handling.
+
+ All errors detected while processing the UPDATE message are indicated
+ by sending the NOTIFICATION message with Error Code UPDATE Message
+ Error. The error subcode elaborates on the specific nature of the
+ error.
+
+ Error checking of an UPDATE message begins by examining the path
+ attributes. If the Unfeasible Routes Length or Total Attribute
+ Length is too large (i.e., if Unfeasible Routes Length + Total
+ Attribute Length + 23 exceeds the message Length), then the Error
+ Subcode is set to Malformed Attribute List.
+
+ If any recognized attribute has Attribute Flags that conflict with
+ the Attribute Type Code, then the Error Subcode is set to Attribute
+ Flags Error. The Data field contains the erroneous attribute (type,
+ length and value).
+
+ If any recognized attribute has Attribute Length that conflicts with
+ the expected length (based on the attribute type code), then the
+ Error Subcode is set to Attribute Length Error. The Data field
+ contains the erroneous attribute (type, length and value).
+
+ If any of the mandatory well-known attributes are not present, then
+ the Error Subcode is set to Missing Well-known Attribute. The Data
+ field contains the Attribute Type Code of the missing well-known
+ attribute.
+
+ If any of the mandatory well-known attributes are not recognized,
+ then the Error Subcode is set to Unrecognized Well-known Attribute.
+ The Data field contains the unrecognized attribute (type, length and
+ value).
+
+ If the ORIGIN attribute has an undefined value, then the Error
+ Subcode is set to Invalid ORIGIN Attribute. The Data field contains
+ the unrecognized attribute (type, length and value).
+
+ If the NEXT_HOP attribute field is syntactically incorrect, then the
+ Error Subcode is set to Invalid NEXT_HOP Attribute. The Data field
+ contains the incorrect attribute (type, length and value). Syntactic
+ correctness means that the NEXT_HOP attribute represents a valid IP
+ host address. Semantic correctness applies only to the external BGP
+
+
+
+Rekhter & Li [Page 26]
+
+RFC 1771 BGP-4 March 1995
+
+
+ links. It means that the interface associated with the IP address, as
+ specified in the NEXT_HOP attribute, shares a common subnet with the
+ receiving BGP speaker and is not the IP address of the receiving BGP
+ speaker. If the NEXT_HOP attribute is semantically incorrect, the
+ error should be logged, and the route should be ignored. In this
+ case, no NOTIFICATION message should be sent.
+
+ The AS_PATH attribute is checked for syntactic correctness. If the
+ path is syntactically incorrect, then the Error Subcode is set to
+ Malformed AS_PATH.
+
+ If an optional attribute is recognized, then the value of this
+ attribute is checked. If an error is detected, the attribute is
+ discarded, and the Error Subcode is set to Optional Attribute Error.
+ The Data field contains the attribute (type, length and value).
+
+ If any attribute appears more than once in the UPDATE message, then
+ the Error Subcode is set to Malformed Attribute List.
+
+ The NLRI field in the UPDATE message is checked for syntactic
+ validity. If the field is syntactically incorrect, then the Error
+ Subcode is set to Invalid Network Field.
+
+6.4 NOTIFICATION message error handling.
+
+ If a peer sends a NOTIFICATION message, and there is an error in that
+ message, there is unfortunately no means of reporting this error via
+ a subsequent NOTIFICATION message. Any such error, such as an
+ unrecognized Error Code or Error Subcode, should be noticed, logged
+ locally, and brought to the attention of the administration of the
+ peer. The means to do this, however, lies outside the scope of this
+ document.
+
+6.5 Hold Timer Expired error handling.
+
+ If a system does not receive successive KEEPALIVE and/or UPDATE
+ and/or NOTIFICATION messages within the period specified in the Hold
+ Time field of the OPEN message, then the NOTIFICATION message with
+ Hold Timer Expired Error Code must be sent and the BGP connection
+ closed.
+
+6.6 Finite State Machine error handling.
+
+ Any error detected by the BGP Finite State Machine (e.g., receipt of
+ an unexpected event) is indicated by sending the NOTIFICATION message
+ with Error Code Finite State Machine Error.
+
+
+
+
+
+Rekhter & Li [Page 27]
+
+RFC 1771 BGP-4 March 1995
+
+
+6.7 Cease.
+
+ In absence of any fatal errors (that are indicated in this section),
+ a BGP peer may choose at any given time to close its BGP connection
+ by sending the NOTIFICATION message with Error Code Cease. However,
+ the Cease NOTIFICATION message must not be used when a fatal error
+ indicated by this section does exist.
+
+6.8 Connection collision detection.
+
+ If a pair of BGP speakers try simultaneously to establish a TCP
+ connection to each other, then two parallel connections between this
+ pair of speakers might well be formed. We refer to this situation as
+ connection collision. Clearly, one of these connections must be
+ closed.
+
+ Based on the value of the BGP Identifier a convention is established
+ for detecting which BGP connection is to be preserved when a
+ collision does occur. The convention is to compare the BGP
+ Identifiers of the peers involved in the collision and to retain only
+ the connection initiated by the BGP speaker with the higher-valued
+ BGP Identifier.
+
+ Upon receipt of an OPEN message, the local system must examine all of
+ its connections that are in the OpenConfirm state. A BGP speaker may
+ also examine connections in an OpenSent state if it knows the BGP
+ Identifier of the peer by means outside of the protocol. If among
+ these connections there is a connection to a remote BGP speaker whose
+ BGP Identifier equals the one in the OPEN message, then the local
+ system performs the following collision resolution procedure:
+
+ 1. The BGP Identifier of the local system is compared to the BGP
+ Identifier of the remote system (as specified in the OPEN
+ message).
+
+ 2. If the value of the local BGP Identifier is less than the
+ remote one, the local system closes BGP connection that already
+ exists (the one that is already in the OpenConfirm state), and
+ accepts BGP connection initiated by the remote system.
+
+ 3. Otherwise, the local system closes newly created BGP connection
+ (the one associated with the newly received OPEN message), and
+ continues to use the existing one (the one that is already in the
+ OpenConfirm state).
+
+ Comparing BGP Identifiers is done by treating them as (4-octet
+ long) unsigned integers.
+
+
+
+
+Rekhter & Li [Page 28]
+
+RFC 1771 BGP-4 March 1995
+
+
+ A connection collision with an existing BGP connection that is in
+ the Established state causes unconditional closing of the newly
+ created connection. Note that a connection collision cannot be
+ detected with connections that are in Idle, or Connect, or Active
+ states.
+
+ Closing the BGP connection (that results from the collision
+ resolution procedure) is accomplished by sending the NOTIFICATION
+ message with the Error Code Cease.
+
+7. BGP Version Negotiation.
+
+ BGP speakers may negotiate the version of the protocol by making
+ multiple attempts to open a BGP connection, starting with the highest
+ version number each supports. If an open attempt fails with an Error
+ Code OPEN Message Error, and an Error Subcode Unsupported Version
+ Number, then the BGP speaker has available the version number it
+ tried, the version number its peer tried, the version number passed
+ by its peer in the NOTIFICATION message, and the version numbers that
+ it supports. If the two peers do support one or more common
+ versions, then this will allow them to rapidly determine the highest
+ common version. In order to support BGP version negotiation, future
+ versions of BGP must retain the format of the OPEN and NOTIFICATION
+ messages.
+
+8. BGP Finite State machine.
+
+ This section specifies BGP operation in terms of a Finite State
+ Machine (FSM). Following is a brief summary and overview of BGP
+ operations by state as determined by this FSM. A condensed version
+ of the BGP FSM is found in Appendix 1.
+
+ Initially BGP is in the Idle state.
+
+ Idle state:
+
+ In this state BGP refuses all incoming BGP connections. No
+ resources are allocated to the peer. In response to the Start
+ event (initiated by either system or operator) the local system
+ initializes all BGP resources, starts the ConnectRetry timer,
+ initiates a transport connection to other BGP peer, while
+ listening for connection that may be initiated by the remote
+ BGP peer, and changes its state to Connect. The exact value of
+ the ConnectRetry timer is a local matter, but should be
+ sufficiently large to allow TCP initialization.
+
+ If a BGP speaker detects an error, it shuts down the connection
+ and changes its state to Idle. Getting out of the Idle state
+
+
+
+Rekhter & Li [Page 29]
+
+RFC 1771 BGP-4 March 1995
+
+
+ requires generation of the Start event. If such an event is
+ generated automatically, then persistent BGP errors may result
+ in persistent flapping of the speaker. To avoid such a
+ condition it is recommended that Start events should not be
+ generated immediately for a peer that was previously
+ transitioned to Idle due to an error. For a peer that was
+ previously transitioned to Idle due to an error, the time
+ between consecutive generation of Start events, if such events
+ are generated automatically, shall exponentially increase. The
+ value of the initial timer shall be 60 seconds. The time shall
+ be doubled for each consecutive retry.
+
+ Any other event received in the Idle state is ignored.
+
+ Connect state:
+
+ In this state BGP is waiting for the transport protocol
+ connection to be completed.
+
+ If the transport protocol connection succeeds, the local system
+ clears the ConnectRetry timer, completes initialization, sends
+ an OPEN message to its peer, and changes its state to OpenSent.
+
+ If the transport protocol connect fails (e.g., retransmission
+ timeout), the local system restarts the ConnectRetry timer,
+ continues to listen for a connection that may be initiated by
+ the remote BGP peer, and changes its state to Active state.
+
+ In response to the ConnectRetry timer expired event, the local
+ system restarts the ConnectRetry timer, initiates a transport
+ connection to the other BGP peer, continues to listen for a
+ connection that may be initiated by the remote BGP peer, and
+ stays in the Connect state.
+
+ Start event is ignored in the Connect state.
+
+ In response to any other event (initiated by either system or
+ operator), the local system releases all BGP resources
+ associated with this connection and changes its state to Idle.
+
+ Active state:
+
+ In this state BGP is trying to acquire a peer by initiating a
+ transport protocol connection.
+
+ If the transport protocol connection succeeds, the local system
+ clears the ConnectRetry timer, completes initialization, sends
+ an OPEN message to its peer, sets its Hold Timer to a large
+
+
+
+Rekhter & Li [Page 30]
+
+RFC 1771 BGP-4 March 1995
+
+
+ value, and changes its state to OpenSent. A Hold Timer value
+ of 4 minutes is suggested.
+
+ In response to the ConnectRetry timer expired event, the local
+ system restarts the ConnectRetry timer, initiates a transport
+ connection to the other BGP peer, continues to listen for a
+ connection that may be initiated by the remote BGP peer, and
+ changes its state to Connect.
+
+ If the local system detects that a remote peer is trying to
+ establish BGP connection to it, and the IP address of the
+ remote peer is not an expected one, the local system restarts
+ the ConnectRetry timer, rejects the attempted connection,
+ continues to listen for a connection that may be initiated by
+ the remote BGP peer, and stays in the Active state.
+
+ Start event is ignored in the Active state.
+
+ In response to any other event (initiated by either system or
+ operator), the local system releases all BGP resources
+ associated with this connection and changes its state to Idle.
+
+ OpenSent state:
+
+ In this state BGP waits for an OPEN message from its peer.
+ When an OPEN message is received, all fields are checked for
+ correctness. If the BGP message header checking or OPEN
+ message checking detects an error (see Section 6.2), or a
+ connection collision (see Section 6.8) the local system sends a
+ NOTIFICATION message and changes its state to Idle.
+
+ If there are no errors in the OPEN message, BGP sends a
+ KEEPALIVE message and sets a KeepAlive timer. The Hold Timer,
+ which was originally set to a large value (see above), is
+ replaced with the negotiated Hold Time value (see section 4.2).
+ If the negotiated Hold Time value is zero, then the Hold Time
+ timer and KeepAlive timers are not started. If the value of
+ the Autonomous System field is the same as the local Autonomous
+ System number, then the connection is an "internal" connection;
+ otherwise, it is "external". (This will affect UPDATE
+ processing as described below.) Finally, the state is changed
+ to OpenConfirm.
+
+ If a disconnect notification is received from the underlying
+ transport protocol, the local system closes the BGP connection,
+ restarts the ConnectRetry timer, continues to listen for a
+ connection that may be initiated by the remote BGP peer, and
+ goes into the Active state.
+
+
+
+Rekhter & Li [Page 31]
+
+RFC 1771 BGP-4 March 1995
+
+
+ If the Hold Timer expires, the local system sends NOTIFICATION
+ message with error code Hold Timer Expired and changes its
+ state to Idle.
+
+ In response to the Stop event (initiated by either system or
+ operator) the local system sends NOTIFICATION message with
+ Error Code Cease and changes its state to Idle.
+
+ Start event is ignored in the OpenSent state.
+
+ In response to any other event the local system sends
+ NOTIFICATION message with Error Code Finite State Machine Error
+ and changes its state to Idle.
+
+ Whenever BGP changes its state from OpenSent to Idle, it closes
+ the BGP (and transport-level) connection and releases all
+ resources associated with that connection.
+
+ OpenConfirm state:
+
+ In this state BGP waits for a KEEPALIVE or NOTIFICATION
+ message.
+
+ If the local system receives a KEEPALIVE message, it changes
+ its state to Established.
+
+ If the Hold Timer expires before a KEEPALIVE message is
+ received, the local system sends NOTIFICATION message with
+ error code Hold Timer Expired and changes its state to Idle.
+
+ If the local system receives a NOTIFICATION message, it changes
+ its state to Idle.
+
+ If the KeepAlive timer expires, the local system sends a
+ KEEPALIVE message and restarts its KeepAlive timer.
+
+ If a disconnect notification is received from the underlying
+ transport protocol, the local system changes its state to Idle.
+
+ In response to the Stop event (initiated by either system or
+ operator) the local system sends NOTIFICATION message with
+ Error Code Cease and changes its state to Idle.
+
+ Start event is ignored in the OpenConfirm state.
+
+ In response to any other event the local system sends
+ NOTIFICATION message with Error Code Finite State Machine Error
+ and changes its state to Idle.
+
+
+
+Rekhter & Li [Page 32]
+
+RFC 1771 BGP-4 March 1995
+
+
+ Whenever BGP changes its state from OpenConfirm to Idle, it
+ closes the BGP (and transport-level) connection and releases
+ all resources associated with that connection.
+
+ Established state:
+
+ In the Established state BGP can exchange UPDATE, NOTIFICATION,
+ and KEEPALIVE messages with its peer.
+
+ If the local system receives an UPDATE or KEEPALIVE message, it
+ restarts its Hold Timer, if the negotiated Hold Time value is
+ non-zero.
+
+ If the local system receives a NOTIFICATION message, it changes
+ its state to Idle.
+
+ If the local system receives an UPDATE message and the UPDATE
+ message error handling procedure (see Section 6.3) detects an
+ error, the local system sends a NOTIFICATION message and
+ changes its state to Idle.
+
+ If a disconnect notification is received from the underlying
+ transport protocol, the local system changes its state to Idle.
+
+ If the Hold Timer expires, the local system sends a
+ NOTIFICATION message with Error Code Hold Timer Expired and
+ changes its state to Idle.
+
+ If the KeepAlive timer expires, the local system sends a
+ KEEPALIVE message and restarts its KeepAlive timer.
+
+ Each time the local system sends a KEEPALIVE or UPDATE message,
+ it restarts its KeepAlive timer, unless the negotiated Hold
+ Time value is zero.
+
+ In response to the Stop event (initiated by either system or
+ operator), the local system sends a NOTIFICATION message with
+ Error Code Cease and changes its state to Idle.
+
+ Start event is ignored in the Established state.
+
+ In response to any other event, the local system sends
+ NOTIFICATION message with Error Code Finite State Machine Error
+ and changes its state to Idle.
+
+ Whenever BGP changes its state from Established to Idle, it
+ closes the BGP (and transport-level) connection, releases all
+ resources associated with that connection, and deletes all
+
+
+
+Rekhter & Li [Page 33]
+
+RFC 1771 BGP-4 March 1995
+
+
+ routes derived from that connection.
+
+9. UPDATE Message Handling
+
+ An UPDATE message may be received only in the Established state.
+ When an UPDATE message is received, each field is checked for
+ validity as specified in Section 6.3.
+
+ If an optional non-transitive attribute is unrecognized, it is
+ quietly ignored. If an optional transitive attribute is
+ unrecognized, the Partial bit (the third high-order bit) in the
+ attribute flags octet is set to 1, and the attribute is retained for
+ propagation to other BGP speakers.
+
+ If an optional attribute is recognized, and has a valid value, then,
+ depending on the type of the optional attribute, it is processed
+ locally, retained, and updated, if necessary, for possible
+ propagation to other BGP speakers.
+
+ If the UPDATE message contains a non-empty WITHDRAWN ROUTES field,
+ the previously advertised routes whose destinations (expressed as IP
+ prefixes) contained in this field shall be removed from the Adj-RIB-
+ In. This BGP speaker shall run its Decision Process since the
+ previously advertised route is no longer available for use.
+
+ If the UPDATE message contains a feasible route, it shall be placed
+ in the appropriate Adj-RIB-In, and the following additional actions
+ shall be taken:
+
+ i) If its Network Layer Reachability Information (NLRI) is identical
+ to the one of a route currently stored in the Adj-RIB-In, then the
+ new route shall replace the older route in the Adj-RIB-In, thus
+ implicitly withdrawing the older route from service. The BGP speaker
+ shall run its Decision Process since the older route is no longer
+ available for use.
+
+ ii) If the new route is an overlapping route that is included (see
+ 9.1.4) in an earlier route contained in the Adj-RIB-In, the BGP
+ speaker shall run its Decision Process since the more specific route
+ has implicitly made a portion of the less specific route unavailable
+ for use.
+
+ iii) If the new route has identical path attributes to an earlier
+ route contained in the Adj-RIB-In, and is more specific (see 9.1.4)
+ than the earlier route, no further actions are necessary.
+
+ iv) If the new route has NLRI that is not present in any of the
+ routes currently stored in the Adj-RIB-In, then the new route shall
+
+
+
+Rekhter & Li [Page 34]
+
+RFC 1771 BGP-4 March 1995
+
+
+ be placed in the Adj-RIB-In. The BGP speaker shall run its Decision
+ Process.
+
+ v) If the new route is an overlapping route that is less specific
+ (see 9.1.4) than an earlier route contained in the Adj-RIB-In, the
+ BGP speaker shall run its Decision Process on the set of destinations
+ described only by the less specific route.
+
+9.1 Decision Process
+
+ The Decision Process selects routes for subsequent advertisement by
+ applying the policies in the local Policy Information Base (PIB) to
+ the routes stored in its Adj-RIB-In. The output of the Decision
+ Process is the set of routes that will be advertised to all peers;
+ the selected routes will be stored in the local speaker's Adj-RIB-
+ Out.
+
+ The selection process is formalized by defining a function that takes
+ the attribute of a given route as an argument and returns a non-
+ negative integer denoting the degree of preference for the route.
+ The function that calculates the degree of preference for a given
+ route shall not use as its inputs any of the following: the
+ existence of other routes, the non-existence of other routes, or the
+ path attributes of other routes. Route selection then consists of
+ individual application of the degree of preference function to each
+ feasible route, followed by the choice of the one with the highest
+ degree of preference.
+
+ The Decision Process operates on routes contained in each Adj-RIB-In,
+ and is responsible for:
+
+ - selection of routes to be advertised to BGP speakers located in
+ the local speaker's autonomous system
+
+ - selection of routes to be advertised to BGP speakers located in
+ neighboring autonomous systems
+
+ - route aggregation and route information reduction
+
+ The Decision Process takes place in three distinct phases, each
+ triggered by a different event:
+
+ a) Phase 1 is responsible for calculating the degree of preference
+ for each route received from a BGP speaker located in a
+ neighboring autonomous system, and for advertising to the other
+ BGP speakers in the local autonomous system the routes that have
+ the highest degree of preference for each distinct destination.
+
+
+
+
+Rekhter & Li [Page 35]
+
+RFC 1771 BGP-4 March 1995
+
+
+ b) Phase 2 is invoked on completion of phase 1. It is responsible
+ for choosing the best route out of all those available for each
+ distinct destination, and for installing each chosen route into
+ the appropriate Loc-RIB.
+
+ c) Phase 3 is invoked after the Loc-RIB has been modified. It is
+ responsible for disseminating routes in the Loc-RIB to each peer
+ located in a neighboring autonomous system, according to the
+ policies contained in the PIB. Route aggregation and information
+ reduction can optionally be performed within this phase.
+
+9.1.1 Phase 1: Calculation of Degree of Preference
+
+ The Phase 1 decision function shall be invoked whenever the local BGP
+ speaker receives an UPDATE message from a peer located in a
+ neighboring autonomous system that advertises a new route, a
+ replacement route, or a withdrawn route.
+
+ The Phase 1 decision function is a separate process which completes
+ when it has no further work to do.
+
+ The Phase 1 decision function shall lock an Adj-RIB-In prior to
+ operating on any route contained within it, and shall unlock it after
+ operating on all new or unfeasible routes contained within it.
+
+ For each newly received or replacement feasible route, the local BGP
+ speaker shall determine a degree of preference. If the route is
+ learned from a BGP speaker in the local autonomous system, either the
+ value of the LOCAL_PREF attribute shall be taken as the degree of
+ preference, or the local system shall compute the degree of
+ preference of the route based on preconfigured policy information. If
+ the route is learned from a BGP speaker in a neighboring autonomous
+ system, then the degree of preference shall be computed based on
+ preconfigured policy information. The exact nature of this policy
+ information and the computation involved is a local matter. The
+ local speaker shall then run the internal update process of 9.2.1 to
+ select and advertise the most preferable route.
+
+9.1.2 Phase 2: Route Selection
+
+ The Phase 2 decision function shall be invoked on completion of Phase
+ 1. The Phase 2 function is a separate process which completes when
+ it has no further work to do. The Phase 2 process shall consider all
+ routes that are present in the Adj-RIBs-In, including those received
+ from BGP speakers located in its own autonomous system and those
+ received from BGP speakers located in neighboring autonomous systems.
+
+
+
+
+
+Rekhter & Li [Page 36]
+
+RFC 1771 BGP-4 March 1995
+
+
+ The Phase 2 decision function shall be blocked from running while the
+ Phase 3 decision function is in process. The Phase 2 function shall
+ lock all Adj-RIBs-In prior to commencing its function, and shall
+ unlock them on completion.
+
+ If the NEXT_HOP attribute of a BGP route depicts an address to which
+ the local BGP speaker doesn't have a route in its Loc-RIB, the BGP
+ route SHOULD be excluded from the Phase 2 decision function.
+
+ For each set of destinations for which a feasible route exists in the
+ Adj-RIBs-In, the local BGP speaker shall identify the route that has:
+
+ a) the highest degree of preference of any route to the same set
+ of destinations, or
+
+ b) is the only route to that destination, or
+
+ c) is selected as a result of the Phase 2 tie breaking rules
+ specified in 9.1.2.1.
+
+ The local speaker SHALL then install that route in the Loc-RIB,
+ replacing any route to the same destination that is currently being
+ held in the Loc-RIB. The local speaker MUST determine the immediate
+ next hop to the address depicted by the NEXT_HOP attribute of the
+ selected route by performing a lookup in the IGP and selecting one of
+ the possible paths in the IGP. This immediate next hop MUST be used
+ when installing the selected route in the Loc-RIB. If the route to
+ the address depicted by the NEXT_HOP attribute changes such that the
+ immediate next hop changes, route selection should be recalculated as
+ specified above.
+
+ Unfeasible routes shall be removed from the Loc-RIB, and
+ corresponding unfeasible routes shall then be removed from the Adj-
+ RIBs-In.
+
+9.1.2.1 Breaking Ties (Phase 2)
+
+ In its Adj-RIBs-In a BGP speaker may have several routes to the same
+ destination that have the same degree of preference. The local
+ speaker can select only one of these routes for inclusion in the
+ associated Loc-RIB. The local speaker considers all equally
+ preferable routes, both those received from BGP speakers located in
+ neighboring autonomous systems, and those received from other BGP
+ speakers located in the local speaker's autonomous system.
+
+ The following tie-breaking procedure assumes that for each candidate
+ route all the BGP speakers within an autonomous system can ascertain
+ the cost of a path (interior distance) to the address depicted by the
+
+
+
+Rekhter & Li [Page 37]
+
+RFC 1771 BGP-4 March 1995
+
+
+ NEXT_HOP attribute of the route. Ties shall be broken according to
+ the following algorithm:
+
+ a) If the local system is configured to take into account
+ MULTI_EXIT_DISC, and the candidate routes differ in their
+ MULTI_EXIT_DISC attribute, select the route that has the lowest
+ value of the MULTI_EXIT_DISC attribute.
+
+ b) Otherwise, select the route that has the lowest cost (interior
+ distance) to the entity depicted by the NEXT_HOP attribute of the
+ route. If there are several routes with the same cost, then the
+ tie shall be broken as follows:
+
+ - if at least one of the candidate routes was advertised by the
+ BGP speaker in a neighboring autonomous system, select the
+ route that was advertised by the BGP speaker in a neighboring
+ autonomous system whose BGP Identifier has the lowest value
+ among all other BGP speakers in neighboring autonomous systems;
+
+ - otherwise, select the route that was advertised by the BGP
+ speaker whose BGP Identifier has the lowest value.
+
+9.1.3 Phase 3: Route Dissemination
+
+ The Phase 3 decision function shall be invoked on completion of Phase
+ 2, or when any of the following events occur:
+
+ a) when routes in a Loc-RIB to local destinations have changed
+
+ b) when locally generated routes learned by means outside of BGP
+ have changed
+
+ c) when a new BGP speaker - BGP speaker connection has been
+ established
+
+ The Phase 3 function is a separate process which completes when it
+ has no further work to do. The Phase 3 Routing Decision function
+ shall be blocked from running while the Phase 2 decision function is
+ in process.
+
+ All routes in the Loc-RIB shall be processed into a corresponding
+ entry in the associated Adj-RIBs-Out. Route aggregation and
+ information reduction techniques (see 9.2.4.1) may optionally be
+ applied.
+
+ For the benefit of future support of inter-AS multicast capabilities,
+ a BGP speaker that participates in inter-AS multicast routing shall
+ advertise a route it receives from one of its external peers and if
+
+
+
+Rekhter & Li [Page 38]
+
+RFC 1771 BGP-4 March 1995
+
+
+ it installs it in its Loc-RIB, it shall advertise it back to the peer
+ from which the route was received. For a BGP speaker that does not
+ participate in inter-AS multicast routing such an advertisement is
+ optional. When doing such an advertisement, the NEXT_HOP attribute
+ should be set to the address of the peer. An implementation may also
+ optimize such an advertisement by truncating information in the
+ AS_PATH attribute to include only its own AS number and that of the
+ peer that advertised the route (such truncation requires the ORIGIN
+ attribute to be set to INCOMPLETE). In addition an implementation is
+ not required to pass optional or discretionary path attributes with
+ such an advertisement.
+
+ When the updating of the Adj-RIBs-Out and the Forwarding Information
+ Base (FIB) is complete, the local BGP speaker shall run the external
+ update process of 9.2.2.
+
+9.1.4 Overlapping Routes
+
+ A BGP speaker may transmit routes with overlapping Network Layer
+ Reachability Information (NLRI) to another BGP speaker. NLRI overlap
+ occurs when a set of destinations are identified in non-matching
+ multiple routes. Since BGP encodes NLRI using IP prefixes, overlap
+ will always exhibit subset relationships. A route describing a
+ smaller set of destinations (a longer prefix) is said to be more
+ specific than a route describing a larger set of destinations (a
+ shorter prefix); similarly, a route describing a larger set of
+ destinations (a shorter prefix) is said to be less specific than a
+ route describing a smaller set of destinations (a longer prefix).
+
+ The precedence relationship effectively decomposes less specific
+ routes into two parts:
+
+ - a set of destinations described only by the less specific
+ route, and
+
+ - a set of destinations described by the overlap of the less
+ specific and the more specific routes
+
+ When overlapping routes are present in the same Adj-RIB-In, the more
+ specific route shall take precedence, in order from more specific to
+ least specific.
+
+ The set of destinations described by the overlap represents a portion
+ of the less specific route that is feasible, but is not currently in
+ use. If a more specific route is later withdrawn, the set of
+ destinations described by the overlap will still be reachable using
+ the less specific route.
+
+
+
+
+Rekhter & Li [Page 39]
+
+RFC 1771 BGP-4 March 1995
+
+
+ If a BGP speaker receives overlapping routes, the Decision Process
+ shall take into account the semantics of the overlapping routes. In
+ particular, if a BGP speaker accepts the less specific route while
+ rejecting the more specific route from the same peer, then the
+ destinations represented by the overlap may not forward along the ASs
+ listed in the AS_PATH attribute of that route. Therefore, a BGP
+ speaker has the following choices:
+
+ a) Install both the less and the more specific routes
+
+ b) Install the more specific route only
+
+ c) Install the non-overlapping part of the less specific
+ route only (that implies de-aggregation)
+
+ d) Aggregate the two routes and install the aggregated route
+
+ e) Install the less specific route only
+
+ f) Install neither route
+
+ If a BGP speaker chooses e), then it should add ATOMIC_AGGREGATE
+ attribute to the route. A route that carries ATOMIC_AGGREGATE
+ attribute can not be de-aggregated. That is, the NLRI of this route
+ can not be made more specific. Forwarding along such a route does
+ not guarantee that IP packets will actually traverse only ASs listed
+ in the AS_PATH attribute of the route. If a BGP speaker chooses a),
+ it must not advertise the more general route without the more
+ specific route.
+
+9.2 Update-Send Process
+
+ The Update-Send process is responsible for advertising UPDATE
+ messages to all peers. For example, it distributes the routes chosen
+ by the Decision Process to other BGP speakers which may be located in
+ either the same autonomous system or a neighboring autonomous system.
+ Rules for information exchange between BGP speakers located in
+ different autonomous systems are given in 9.2.2; rules for
+ information exchange between BGP speakers located in the same
+ autonomous system are given in 9.2.1.
+
+ Distribution of routing information between a set of BGP speakers,
+ all of which are located in the same autonomous system, is referred
+ to as internal distribution.
+
+
+
+
+
+
+
+Rekhter & Li [Page 40]
+
+RFC 1771 BGP-4 March 1995
+
+
+9.2.1 Internal Updates
+
+ The Internal update process is concerned with the distribution of
+ routing information to BGP speakers located in the local speaker's
+ autonomous system.
+
+ When a BGP speaker receives an UPDATE message from another BGP
+ speaker located in its own autonomous system, the receiving BGP
+ speaker shall not re-distribute the routing information contained in
+ that UPDATE message to other BGP speakers located in its own
+ autonomous system.
+
+ When a BGP speaker receives a new route from a BGP speaker in a
+ neighboring autonomous system, it shall advertise that route to all
+ other BGP speakers in its autonomous system by means of an UPDATE
+ message if any of the following conditions occur:
+
+ 1) the degree of preference assigned to the newly received route
+ by the local BGP speaker is higher than the degree of preference
+ that the local speaker has assigned to other routes that have been
+ received from BGP speakers in neighboring autonomous systems, or
+
+ 2) there are no other routes that have been received from BGP
+ speakers in neighboring autonomous systems, or
+
+ 3) the newly received route is selected as a result of breaking a
+ tie between several routes which have the highest degree of
+ preference, and the same destination (the tie-breaking procedure
+ is specified in 9.2.1.1).
+
+ When a BGP speaker receives an UPDATE message with a non-empty
+ WITHDRAWN ROUTES field, it shall remove from its Adj-RIB-In all
+ routes whose destinations were carried in this field (as IP prefixes).
+ The speaker shall take the following additional steps:
+
+ 1) if the corresponding feasible route had not been previously
+ advertised, then no further action is necessary
+
+ 2) if the corresponding feasible route had been previously
+ advertised, then:
+
+ i) if a new route is selected for advertisement that has the
+ same Network Layer Reachability Information as the unfeasible
+ routes, then the local BGP speaker shall advertise the
+ replacement route
+
+ ii) if a replacement route is not available for advertisement,
+ then the BGP speaker shall include the destinations of the
+
+
+
+Rekhter & Li [Page 41]
+
+RFC 1771 BGP-4 March 1995
+
+
+ unfeasible route (in form of IP prefixes) in the WITHDRAWN
+ ROUTES field of an UPDATE message, and shall send this message
+ to each peer to whom it had previously advertised the
+ corresponding feasible route.
+
+ All feasible routes which are advertised shall be placed in the
+ appropriate Adj-RIBs-Out, and all unfeasible routes which are
+ advertised shall be removed from the Adj-RIBs-Out.
+
+9.2.1.1 Breaking Ties (Internal Updates)
+
+ If a local BGP speaker has connections to several BGP speakers in
+ neighboring autonomous systems, there will be multiple Adj-RIBs-In
+ associated with these peers. These Adj-RIBs-In might contain several
+ equally preferable routes to the same destination, all of which were
+ advertised by BGP speakers located in neighboring autonomous systems.
+ The local BGP speaker shall select one of these routes according to
+ the following rules:
+
+ a) If the candidate routes differ only in their NEXT_HOP and
+ MULTI_EXIT_DISC attributes, and the local system is configured to
+ take into account the MULTI_EXIT_DISC attribute, select the route
+ that has the lowest value of the MULTI_EXIT_DISC attribute.
+
+ b) If the local system can ascertain the cost of a path to the
+ entity depicted by the NEXT_HOP attribute of the candidate route,
+ select the route with the lowest cost.
+
+ c) In all other cases, select the route that was advertised by the
+ BGP speaker whose BGP Identifier has the lowest value.
+
+9.2.2 External Updates
+
+ The external update process is concerned with the distribution of
+ routing information to BGP speakers located in neighboring autonomous
+ systems. As part of Phase 3 route selection process, the BGP speaker
+ has updated its Adj-RIBs-Out and its Forwarding Table. All newly
+ installed routes and all newly unfeasible routes for which there is
+ no replacement route shall be advertised to BGP speakers located in
+ neighboring autonomous systems by means of UPDATE message.
+
+ Any routes in the Loc-RIB marked as unfeasible shall be removed.
+ Changes to the reachable destinations within its own autonomous
+ system shall also be advertised in an UPDATE message.
+
+
+
+
+
+
+
+Rekhter & Li [Page 42]
+
+RFC 1771 BGP-4 March 1995
+
+
+9.2.3 Controlling Routing Traffic Overhead
+
+ The BGP protocol constrains the amount of routing traffic (that is,
+ UPDATE messages) in order to limit both the link bandwidth needed to
+ advertise UPDATE messages and the processing power needed by the
+ Decision Process to digest the information contained in the UPDATE
+ messages.
+
+9.2.3.1 Frequency of Route Advertisement
+
+ The parameter MinRouteAdvertisementInterval determines the minimum
+ amount of time that must elapse between advertisement of routes to a
+ particular destination from a single BGP speaker. This rate limiting
+ procedure applies on a per-destination basis, although the value of
+ MinRouteAdvertisementInterval is set on a per BGP peer basis.
+
+ Two UPDATE messages sent from a single BGP speaker that advertise
+ feasible routes to some common set of destinations received from BGP
+ speakers in neighboring autonomous systems must be separated by at
+ least MinRouteAdvertisementInterval. Clearly, this can only be
+ achieved precisely by keeping a separate timer for each common set of
+ destinations. This would be unwarranted overhead. Any technique which
+ ensures that the interval between two UPDATE messages sent from a
+ single BGP speaker that advertise feasible routes to some common set
+ of destinations received from BGP speakers in neighboring autonomous
+ systems will be at least MinRouteAdvertisementInterval, and will also
+ ensure a constant upper bound on the interval is acceptable.
+
+ Since fast convergence is needed within an autonomous system, this
+ procedure does not apply to routes received from other BGP speakers
+ in the same autonomous system. To avoid long-lived black holes, the
+ procedure does not apply to the explicit withdrawal of unfeasible
+ routes (that is, routes whose destinations (expressed as IP prefixes)
+ are listed in the WITHDRAWN ROUTES field of an UPDATE message).
+
+ This procedure does not limit the rate of route selection, but only
+ the rate of route advertisement. If new routes are selected multiple
+ times while awaiting the expiration of MinRouteAdvertisementInterval,
+ the last route selected shall be advertised at the end of
+ MinRouteAdvertisementInterval.
+
+9.2.3.2 Frequency of Route Origination
+
+ The parameter MinASOriginationInterval determines the minimum amount
+ of time that must elapse between successive advertisements of UPDATE
+ messages that report changes within the advertising BGP speaker's own
+ autonomous system.
+
+
+
+
+Rekhter & Li [Page 43]
+
+RFC 1771 BGP-4 March 1995
+
+
+9.2.3.3 Jitter
+
+ To minimize the likelihood that the distribution of BGP messages by a
+ given BGP speaker will contain peaks, jitter should be applied to the
+ timers associated with MinASOriginationInterval, Keepalive, and
+ MinRouteAdvertisementInterval. A given BGP speaker shall apply the
+ same jitter to each of these quantities regardless of the
+ destinations to which the updates are being sent; that is, jitter
+ will not be applied on a "per peer" basis.
+
+ The amount of jitter to be introduced shall be determined by
+ multiplying the base value of the appropriate timer by a random
+ factor which is uniformly distributed in the range from 0.75 to 1.0.
+
+9.2.4 Efficient Organization of Routing Information
+
+ Having selected the routing information which it will advertise, a
+ BGP speaker may avail itself of several methods to organize this
+ information in an efficient manner.
+
+9.2.4.1 Information Reduction
+
+ Information reduction may imply a reduction in granularity of policy
+ control - after information is collapsed, the same policies will
+ apply to all destinations and paths in the equivalence class.
+
+ The Decision Process may optionally reduce the amount of information
+ that it will place in the Adj-RIBs-Out by any of the following
+ methods:
+
+ a) Network Layer Reachability Information (NLRI):
+
+ Destination IP addresses can be represented as IP address
+ prefixes. In cases where there is a correspondence between the
+ address structure and the systems under control of an autonomous
+ system administrator, it will be possible to reduce the size of
+ the NLRI carried in the UPDATE messages.
+
+ b) AS_PATHs:
+
+ AS path information can be represented as ordered AS_SEQUENCEs or
+ unordered AS_SETs. AS_SETs are used in the route aggregation
+ algorithm described in 9.2.4.2. They reduce the size of the
+ AS_PATH information by listing each AS number only once,
+ regardless of how many times it may have appeared in multiple
+ AS_PATHs that were aggregated.
+
+
+
+
+
+Rekhter & Li [Page 44]
+
+RFC 1771 BGP-4 March 1995
+
+
+ An AS_SET implies that the destinations listed in the NLRI can be
+ reached through paths that traverse at least some of the
+ constituent autonomous systems. AS_SETs provide sufficient
+ information to avoid routing information looping; however their
+ use may prune potentially feasible paths, since such paths are no
+ longer listed individually as in the form of AS_SEQUENCEs. In
+ practice this is not likely to be a problem, since once an IP
+ packet arrives at the edge of a group of autonomous systems, the
+ BGP speaker at that point is likely to have more detailed path
+ information and can distinguish individual paths to destinations.
+
+9.2.4.2 Aggregating Routing Information
+
+ Aggregation is the process of combining the characteristics of
+ several different routes in such a way that a single route can be
+ advertised. Aggregation can occur as part of the decision process
+ to reduce the amount of routing information that will be placed in
+ the Adj-RIBs-Out.
+
+ Aggregation reduces the amount of information that a BGP speaker must
+ store and exchange with other BGP speakers. Routes can be aggregated
+ by applying the following procedure separately to path attributes of
+ like type and to the Network Layer Reachability Information.
+
+ Routes that have the following attributes shall not be aggregated
+ unless the corresponding attributes of each route are identical:
+ MULTI_EXIT_DISC, NEXT_HOP.
+
+ Path attributes that have different type codes can not be aggregated
+ together. Path attributes of the same type code may be aggregated,
+ according to the following rules:
+
+ ORIGIN attribute: If at least one route among routes that are
+ aggregated has ORIGIN with the value INCOMPLETE, then the
+ aggregated route must have the ORIGIN attribute with the value
+ INCOMPLETE. Otherwise, if at least one route among routes that are
+ aggregated has ORIGIN with the value EGP, then the aggregated
+ route must have the ORIGIN attribute with the value EGP. In all
+ other cases the value of the ORIGIN attribute of the aggregated
+ route is IGP.
+
+ AS_PATH attribute: If routes to be aggregated have identical
+ AS_PATH attributes, then the aggregated route has the same AS_PATH
+ attribute as each individual route.
+
+ For the purpose of aggregating AS_PATH attributes we model each AS
+ within the AS_PATH attribute as a tuple <type, value>, where
+ "type" identifies a type of the path segment the AS belongs to
+
+
+
+Rekhter & Li [Page 45]
+
+RFC 1771 BGP-4 March 1995
+
+
+ (e.g. AS_SEQUENCE, AS_SET), and "value" is the AS number. If the
+ routes to be aggregated have different AS_PATH attributes, then
+ the aggregated AS_PATH attribute shall satisfy all of the
+ following conditions:
+
+ - all tuples of the type AS_SEQUENCE in the aggregated AS_PATH
+ shall appear in all of the AS_PATHs in the initial set of routes
+ to be aggregated.
+
+ - all tuples of the type AS_SET in the aggregated AS_PATH shall
+ appear in at least one of the AS_PATHs in the initial set (they
+ may appear as either AS_SET or AS_SEQUENCE types).
+
+ - for any tuple X of the type AS_SEQUENCE in the aggregated
+ AS_PATH which precedes tuple Y in the aggregated AS_PATH, X
+ precedes Y in each AS_PATH in the initial set which contains Y,
+ regardless of the type of Y.
+
+ - No tuple with the same value shall appear more than once in
+ the aggregated AS_PATH, regardless of the tuple's type.
+
+ An implementation may choose any algorithm which conforms to these
+ rules. At a minimum a conformant implementation shall be able to
+ perform the following algorithm that meets all of the above
+ conditions:
+
+ - determine the longest leading sequence of tuples (as defined
+ above) common to all the AS_PATH attributes of the routes to be
+ aggregated. Make this sequence the leading sequence of the
+ aggregated AS_PATH attribute.
+
+ - set the type of the rest of the tuples from the AS_PATH
+ attributes of the routes to be aggregated to AS_SET, and append
+ them to the aggregated AS_PATH attribute.
+
+ - if the aggregated AS_PATH has more than one tuple with the
+ same value (regardless of tuple's type), eliminate all but one
+ such tuple by deleting tuples of the type AS_SET from the
+ aggregated AS_PATH attribute.
+
+ Appendix 6, section 6.8 presents another algorithm that satisfies
+ the conditions and allows for more complex policy configurations.
+
+ ATOMIC_AGGREGATE: If at least one of the routes to be aggregated
+ has ATOMIC_AGGREGATE path attribute, then the aggregated route
+ shall have this attribute as well.
+
+
+
+
+
+Rekhter & Li [Page 46]
+
+RFC 1771 BGP-4 March 1995
+
+
+ AGGREGATOR: All AGGREGATOR attributes of all routes to be
+ aggregated should be ignored.
+
+9.3 Route Selection Criteria
+
+ Generally speaking, additional rules for comparing routes among
+ several alternatives are outside the scope of this document. There
+ are two exceptions:
+
+ - If the local AS appears in the AS path of the new route being
+ considered, then that new route cannot be viewed as better than
+ any other route. If such a route were ever used, a routing loop
+ would result.
+
+ - In order to achieve successful distributed operation, only
+ routes with a likelihood of stability can be chosen. Thus, an AS
+ must avoid using unstable routes, and it must not make rapid
+ spontaneous changes to its choice of route. Quantifying the terms
+ "unstable" and "rapid" in the previous sentence will require
+ experience, but the principle is clear.
+
+9.4 Originating BGP routes
+
+ A BGP speaker may originate BGP routes by injecting routing
+ information acquired by some other means (e.g. via an IGP) into BGP.
+ A BGP speaker that originates BGP routes shall assign the degree of
+ preference to these routes by passing them through the Decision
+ Process (see Section 9.1). These routes may also be distributed to
+ other BGP speakers within the local AS as part of the Internal update
+ process (see Section 9.2.1). The decision whether to distribute non-
+ BGP acquired routes within an AS via BGP or not depends on the
+ environment within the AS (e.g. type of IGP) and should be controlled
+ via configuration.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Rekhter & Li [Page 47]
+
+RFC 1771 BGP-4 March 1995
+
+
+Appendix 1. BGP FSM State Transitions and Actions.
+
+ This Appendix discusses the transitions between states in the BGP FSM
+ in response to BGP events. The following is the list of these states
+ and events when the negotiated Hold Time value is non-zero.
+
+ BGP States:
+
+ 1 - Idle
+ 2 - Connect
+ 3 - Active
+ 4 - OpenSent
+ 5 - OpenConfirm
+ 6 - Established
+
+ BGP Events:
+
+ 1 - BGP Start
+ 2 - BGP Stop
+ 3 - BGP Transport connection open
+ 4 - BGP Transport connection closed
+ 5 - BGP Transport connection open failed
+ 6 - BGP Transport fatal error
+ 7 - ConnectRetry timer expired
+ 8 - Hold Timer expired
+ 9 - KeepAlive timer expired
+ 10 - Receive OPEN message
+ 11 - Receive KEEPALIVE message
+ 12 - Receive UPDATE messages
+ 13 - Receive NOTIFICATION message
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Rekhter & Li [Page 48]
+
+RFC 1771 BGP-4 March 1995
+
+
+ The following table describes the state transitions of the BGP FSM
+ and the actions triggered by these transitions.
+
+
+ Event Actions Message Sent Next State
+ --------------------------------------------------------------------
+ Idle (1)
+ 1 Initialize resources none 2
+ Start ConnectRetry timer
+ Initiate a transport connection
+ others none none 1
+
+ Connect(2)
+ 1 none none 2
+ 3 Complete initialization OPEN 4
+ Clear ConnectRetry timer
+ 5 Restart ConnectRetry timer none 3
+ 7 Restart ConnectRetry timer none 2
+ Initiate a transport connection
+ others Release resources none 1
+
+ Active (3)
+ 1 none none 3
+ 3 Complete initialization OPEN 4
+ Clear ConnectRetry timer
+ 5 Close connection 3
+ Restart ConnectRetry timer
+ 7 Restart ConnectRetry timer none 2
+ Initiate a transport connection
+ others Release resources none 1
+
+ OpenSent(4)
+ 1 none none 4
+ 4 Close transport connection none 3
+ Restart ConnectRetry timer
+ 6 Release resources none 1
+ 10 Process OPEN is OK KEEPALIVE 5
+ Process OPEN failed NOTIFICATION 1
+ others Close transport connection NOTIFICATION 1
+ Release resources
+
+
+
+
+
+
+
+
+
+
+
+Rekhter & Li [Page 49]
+
+RFC 1771 BGP-4 March 1995
+
+
+ OpenConfirm (5)
+ 1 none none 5
+ 4 Release resources none 1
+ 6 Release resources none 1
+ 9 Restart KeepAlive timer KEEPALIVE 5
+ 11 Complete initialization none 6
+ Restart Hold Timer
+ 13 Close transport connection 1
+ Release resources
+ others Close transport connection NOTIFICATION 1
+ Release resources
+
+ Established (6)
+ 1 none none 6
+ 4 Release resources none 1
+ 6 Release resources none 1
+ 9 Restart KeepAlive timer KEEPALIVE 6
+ 11 Restart Hold Timer KEEPALIVE 6
+ 12 Process UPDATE is OK UPDATE 6
+ Process UPDATE failed NOTIFICATION 1
+ 13 Close transport connection 1
+ Release resources
+ others Close transport connection NOTIFICATION 1
+ Release resources
+ ---------------------------------------------------------------------
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Rekhter & Li [Page 50]
+
+RFC 1771 BGP-4 March 1995
+
+
+ The following is a condensed version of the above state transition
+ table.
+
+
+ Events| Idle | Connect | Active | OpenSent | OpenConfirm | Estab
+ | (1) | (2) | (3) | (4) | (5) | (6)
+ |--------------------------------------------------------------
+ 1 | 2 | 2 | 3 | 4 | 5 | 6
+ | | | | | |
+ 2 | 1 | 1 | 1 | 1 | 1 | 1
+ | | | | | |
+ 3 | 1 | 4 | 4 | 1 | 1 | 1
+ | | | | | |
+ 4 | 1 | 1 | 1 | 3 | 1 | 1
+ | | | | | |
+ 5 | 1 | 3 | 3 | 1 | 1 | 1
+ | | | | | |
+ 6 | 1 | 1 | 1 | 1 | 1 | 1
+ | | | | | |
+ 7 | 1 | 2 | 2 | 1 | 1 | 1
+ | | | | | |
+ 8 | 1 | 1 | 1 | 1 | 1 | 1
+ | | | | | |
+ 9 | 1 | 1 | 1 | 1 | 5 | 6
+ | | | | | |
+ 10 | 1 | 1 | 1 | 1 or 5 | 1 | 1
+ | | | | | |
+ 11 | 1 | 1 | 1 | 1 | 6 | 6
+ | | | | | |
+ 12 | 1 | 1 | 1 | 1 | 1 | 1 or 6
+ | | | | | |
+ 13 | 1 | 1 | 1 | 1 | 1 | 1
+ | | | | | |
+ ---------------------------------------------------------------
+
+
+Appendix 2. Comparison with RFC1267
+
+ BGP-4 is capable of operating in an environment where a set of
+ reachable destinations may be expressed via a single IP prefix. The
+ concept of network classes, or subnetting, is foreign to BGP-4. To
+ accommodate these capabilities BGP-4 changes semantics and encoding
+ associated with the AS_PATH attribute. New text has been added to
+ define semantics associated with IP prefixes. These abilities allow
+ BGP-4 to support the proposed supernetting scheme [9].
+
+ To simplify configuration this version introduces a new attribute,
+ LOCAL_PREF, that facilitates route selection procedures.
+
+
+
+Rekhter & Li [Page 51]
+
+RFC 1771 BGP-4 March 1995
+
+
+ The INTER_AS_METRIC attribute has been renamed to be MULTI_EXIT_DISC.
+ A new attribute, ATOMIC_AGGREGATE, has been introduced to ensure that
+ certain aggregates are not de-aggregated. Another new attribute,
+ AGGREGATOR, can be added to aggregate routes in order to advertise
+ which AS and which BGP speaker within that AS caused the aggregation.
+
+ To ensure that Hold Timers are symmetric, the Hold Time is now
+ negotiated on a per-connection basis. Hold Times of zero are now
+ supported.
+
+Appendix 3. Comparison with RFC 1163
+
+ All of the changes listed in Appendix 2, plus the following.
+
+ To detect and recover from BGP connection collision, a new field (BGP
+ Identifier) has been added to the OPEN message. New text (Section
+ 6.8) has been added to specify the procedure for detecting and
+ recovering from collision.
+
+ The new document no longer restricts the border router that is passed
+ in the NEXT_HOP path attribute to be part of the same Autonomous
+ System as the BGP Speaker.
+
+ The new document optimizes and simplifies the exchange of the
+ information about previously reachable routes.
+
+Appendix 4. Comparison with RFC 1105
+
+ All of the changes listed in Appendices 2 and 3, plus the following.
+
+ Minor changes to the RFC1105 Finite State Machine were necessary to
+ accommodate the TCP user interface provided by 4.3 BSD.
+
+ The notion of Up/Down/Horizontal relations present in RFC1105 has
+ been removed from the protocol.
+
+ The changes in the message format from RFC1105 are as follows:
+
+ 1. The Hold Time field has been removed from the BGP header and
+ added to the OPEN message.
+
+ 2. The version field has been removed from the BGP header and
+ added to the OPEN message.
+
+ 3. The Link Type field has been removed from the OPEN message.
+
+ 4. The OPEN CONFIRM message has been eliminated and replaced with
+ implicit confirmation provided by the KEEPALIVE message.
+
+
+
+Rekhter & Li [Page 52]
+
+RFC 1771 BGP-4 March 1995
+
+
+ 5. The format of the UPDATE message has been changed
+ significantly. New fields were added to the UPDATE message to
+ support multiple path attributes.
+
+ 6. The Marker field has been expanded and its role broadened to
+ support authentication.
+
+ Note that quite often BGP as specified in RFC 1105 is referred to
+ as BGP-1; BGP as specified in RFC 1163 is referred to as BGP-2;
+ BGP as specified in RFC 1267 is referred to as BGP-3; and BGP as
+ specified in this document is referred to as BGP-4.
+
+Appendix 5. TCP options that may be used with BGP
+
+ If a local system TCP user interface supports TCP PUSH function, then
+ each BGP message should be transmitted with PUSH flag set. Setting
+ PUSH flag forces BGP messages to be transmitted promptly to the
+ receiver.
+
+ If a local system TCP user interface supports setting precedence for
+ TCP connection, then the BGP transport connection should be opened
+ with precedence set to Internetwork Control (110) value (see also
+ [6]).
+
+Appendix 6. Implementation Recommendations
+
+ This section presents some implementation recommendations.
+
+6.1 Multiple Networks Per Message
+
+ The BGP protocol allows for multiple address prefixes with the same
+ AS path and next-hop gateway to be specified in one message. Making
+ use of this capability is highly recommended. With one address prefix
+ per message there is a substantial increase in overhead in the
+ receiver. Not only does the system overhead increase due to the
+ reception of multiple messages, but the overhead of scanning the
+ routing table for updates to BGP peers and other routing protocols
+ (and sending the associated messages) is incurred multiple times as
+ well. One method of building messages containing many address
+ prefixes per AS path and gateway from a routing table that is not
+ organized per AS path is to build many messages as the routing table
+ is scanned. As each address prefix is processed, a message for the
+ associated AS path and gateway is allocated, if it does not exist,
+ and the new address prefix is added to it. If such a message exists,
+ the new address prefix is just appended to it. If the message lacks
+ the space to hold the new address prefix, it is transmitted, a new
+ message is allocated, and the new address prefix is inserted into the
+ new message. When the entire routing table has been scanned, all
+
+
+
+Rekhter & Li [Page 53]
+
+RFC 1771 BGP-4 March 1995
+
+
+ allocated messages are sent and their resources released. Maximum
+ compression is achieved when all the destinations covered by the
+ address prefixes share a gateway and common path attributes, making
+ it possible to send many address prefixes in one 4096-byte message.
+
+ When peering with a BGP implementation that does not compress
+ multiple address prefixes into one message, it may be necessary to
+ take steps to reduce the overhead from the flood of data received
+ when a peer is acquired or a significant network topology change
+ occurs. One method of doing this is to limit the rate of updates.
+ This will eliminate the redundant scanning of the routing table to
+ provide flash updates for BGP peers and other routing protocols. A
+ disadvantage of this approach is that it increases the propagation
+ latency of routing information. By choosing a minimum flash update
+ interval that is not much greater than the time it takes to process
+ the multiple messages this latency should be minimized. A better
+ method would be to read all received messages before sending updates.
+
+6.2 Processing Messages on a Stream Protocol
+
+ BGP uses TCP as a transport mechanism. Due to the stream nature of
+ TCP, all the data for received messages does not necessarily arrive
+ at the same time. This can make it difficult to process the data as
+ messages, especially on systems such as BSD Unix where it is not
+ possible to determine how much data has been received but not yet
+ processed.
+
+ One method that can be used in this situation is to first try to read
+ just the message header. For the KEEPALIVE message type, this is a
+ complete message; for other message types, the header should first be
+ verified, in particular the total length. If all checks are
+ successful, the specified length, minus the size of the message
+ header is the amount of data left to read. An implementation that
+ would "hang" the routing information process while trying to read
+ from a peer could set up a message buffer (4096 bytes) per peer and
+ fill it with data as available until a complete message has been
+ received.
+
+6.3 Reducing route flapping
+
+ To avoid excessive route flapping a BGP speaker which needs to
+ withdraw a destination and send an update about a more specific or
+ less specific route shall combine them into the same UPDATE message.
+
+
+
+
+
+
+
+
+Rekhter & Li [Page 54]
+
+RFC 1771 BGP-4 March 1995
+
+
+6.4 BGP Timers
+
+ BGP employs five timers: ConnectRetry, Hold Time, KeepAlive,
+ MinASOriginationInterval, and MinRouteAdvertisementInterval. The
+ suggested value for the ConnectRetry timer is 120 seconds. The
+ suggested value for the Hold Time is 90 seconds. The suggested value
+ for the KeepAlive timer is 30 seconds. The suggested value for the
+ MinASOriginationInterval is 15 seconds. The suggested value for the
+ MinRouteAdvertisementInterval is 30 seconds.
+
+ An implementation of BGP MUST allow these timers to be configurable.
+
+6.5 Path attribute ordering
+
+ Implementations which combine update messages as described above in
+ 6.1 may prefer to see all path attributes presented in a known order.
+ This permits them to quickly identify sets of attributes from
+ different update messages which are semantically identical. To
+ facilitate this, it is a useful optimization to order the path
+ attributes according to type code. This optimization is entirely
+ optional.
+
+6.6 AS_SET sorting
+
+ Another useful optimization that can be done to simplify this
+ situation is to sort the AS numbers found in an AS_SET. This
+ optimization is entirely optional.
+
+6.7 Control over version negotiation
+
+ Since BGP-4 is capable of carrying aggregated routes which cannot be
+ properly represented in BGP-3, an implementation which supports BGP-4
+ and another BGP version should provide the capability to only speak
+ BGP-4 on a per-peer basis.
+
+6.8 Complex AS_PATH aggregation
+
+ An implementation which chooses to provide a path aggregation
+ algorithm which retains significant amounts of path information may
+ wish to use the following procedure:
+
+ For the purpose of aggregating AS_PATH attributes of two routes,
+ we model each AS as a tuple <type, value>, where "type" identifies
+ a type of the path segment the AS belongs to (e.g. AS_SEQUENCE,
+ AS_SET), and "value" is the AS number. Two ASs are said to be the
+ same if their corresponding <type, value> tuples are the same.
+
+
+
+
+
+Rekhter & Li [Page 55]
+
+RFC 1771 BGP-4 March 1995
+
+
+ The algorithm to aggregate two AS_PATH attributes works as
+ follows:
+
+ a) Identify the same ASs (as defined above) within each AS_PATH
+ attribute that are in the same relative order within both
+ AS_PATH attributes. Two ASs, X and Y, are said to be in the
+ same order if either:
+
+ - X precedes Y in both AS_PATH attributes, or
+ - Y precedes X in both AS_PATH attributes.
+
+ b) The aggregated AS_PATH attribute consists of ASs identified
+ in (a) in exactly the same order as they appear in the AS_PATH
+ attributes to be aggregated. If two consecutive ASs identified
+ in (a) do not immediately follow each other in both of the
+ AS_PATH attributes to be aggregated, then the intervening ASs
+ (ASs that are between the two consecutive ASs that are the
+ same) in both attributes are combined into an AS_SET path
+ segment that consists of the intervening ASs from both AS_PATH
+ attributes; this segment is then placed in between the two
+ consecutive ASs identified in (a) of the aggregated attribute.
+ If two consecutive ASs identified in (a) immediately follow
+ each other in one attribute, but do not follow in another, then
+ the intervening ASs of the latter are combined into an AS_SET
+ path segment; this segment is then placed in between the two
+ consecutive ASs identified in (a) of the aggregated attribute.
+
+ If as a result of the above procedure a given AS number appears
+ more than once within the aggregated AS_PATH attribute, all but
+ the last instance (rightmost occurrence) of that AS number should
+ be removed from the aggregated AS_PATH attribute.
+
+References
+
+ [1] Mills, D., "Exterior Gateway Protocol Formal Specification", RFC
+ 904, BBN, April 1984.
+
+ [2] Rekhter, Y., "EGP and Policy Based Routing in the New NSFNET
+ Backbone", RFC 1092, T.J. Watson Research Center, February 1989.
+
+ [3] Braun, H-W., "The NSFNET Routing Architecture", RFC 1093,
+ MERIT/NSFNET Project, February 1989.
+
+ [4] Postel, J., "Transmission Control Protocol - DARPA Internet
+ Program Protocol Specification", STD 7, RFC 793, DARPA, September
+ 1981.
+
+
+
+
+
+Rekhter & Li [Page 56]
+
+RFC 1771 BGP-4 March 1995
+
+
+ [5] Rekhter, Y., and P. Gross, "Application of the Border Gateway
+ Protocol in the Internet", RFC 1772, T.J. Watson Research Center,
+ IBM Corp., MCI, March 1995.
+
+ [6] Postel, J., "Internet Protocol - DARPA Internet Program Protocol
+ Specification", STD 5, RFC 791, DARPA, September 1981.
+
+ [7] "Information Processing Systems - Telecommunications and
+ Information Exchange between Systems - Protocol for Exchange of
+ Inter-domain Routeing Information among Intermediate Systems to
+ Support Forwarding of ISO 8473 PDUs", ISO/IEC IS10747, 1993.
+
+ [8] Fuller, V., Li, T., Yu, J., and K. Varadhan, "Classless Inter-
+ Domain Routing (CIDR): an Address Assignment and Aggregation
+ Strategy", RFC 1519, BARRNet, cisco, MERIT, OARnet, September
+ 1993.
+
+ [9] Rekhter, Y., Li, T., "An Architecture for IP Address Allocation
+ with CIDR", RFC 1518, T.J. Watson Research Center, cisco,
+ September 1993.
+
+Security Considerations
+
+ Security issues are not discussed in this document.
+
+Editors' Addresses
+
+ Yakov Rekhter
+ T.J. Watson Research Center IBM Corporation
+ P.O. Box 704, Office H3-D40
+ Yorktown Heights, NY 10598
+
+ Phone: +1 914 784 7361
+ EMail: yakov@watson.ibm.com
+
+
+ Tony Li
+ cisco Systems, Inc.
+ 170 W. Tasman Dr.
+ San Jose, CA 95134
+
+ EMail: tli@cisco.com
+
+
+
+
+
+
+
+
+
+Rekhter & Li [Page 57]
+
diff --git a/doc/rfc4360.txt b/doc/rfc4360.txt
@@ -0,0 +1,675 @@
+
+
+
+
+
+
+Network Working Group S. Sangli
+Request for Comments: 4360 D. Tappan
+Category: Standards Track Cisco Systems
+ Y. Rekhter
+ Juniper Networks
+ February 2006
+
+
+ BGP Extended Communities Attribute
+
+Status of This Memo
+
+ This document specifies an Internet standards track protocol for the
+ Internet community, and requests discussion and suggestions for
+ improvements. Please refer to the current edition of the "Internet
+ Official Protocol Standards" (STD 1) for the standardization state
+ and status of this protocol. Distribution of this memo is unlimited.
+
+Copyright Notice
+
+ Copyright (C) The Internet Society (2006).
+
+Abstract
+
+ This document describes the "extended community" BGP-4 attribute.
+ This attribute provides a mechanism for labeling information carried
+ in BGP-4. These labels can be used to control the distribution of
+ this information, or for other applications.
+
+1. Introduction
+
+ The Extended Community Attribute provides a mechanism for labeling
+ information carried in BGP-4 [BGP-4]. It provides two important
+ enhancements over the existing BGP Community Attribute [RFC1997]:
+
+ - An extended range, ensuring that communities can be assigned for
+ a plethora of uses, without fear of overlap.
+
+ - The addition of a Type field provides structure for the
+ community space.
+
+ The addition of structure allows the usage of policy based on the
+ application for which the community value will be used. For example,
+ one can filter out all communities of a particular type, or allow
+ only certain values for a particular type of community. It also
+ allows one to specify whether a particular community is transitive or
+ non-transitive across an Autonomous System (AS) boundary. Without
+ structure, this can only be accomplished by explicitly enumerating
+
+
+
+Sangli, et al. Standards Track [Page 1]
+
+RFC 4360 BGP Extended Communities Attribute February 2006
+
+
+ all community values that will be denied or allowed and passed to BGP
+ speakers in neighboring ASes based on the transitive property.
+
+1.1. Specification of Requirements
+
+ The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT",
+ "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this
+ document are to be interpreted as described in RFC 2119 [RFC2119].
+
+2. BGP Extended Communities Attribute
+
+ The Extended Communities Attribute is a transitive optional BGP
+ attribute, with the Type Code 16. The attribute consists of a set of
+ "extended communities". All routes with the Extended Communities
+ attribute belong to the communities listed in the attribute.
+
+ Each Extended Community is encoded as an 8-octet quantity, as
+ follows:
+
+ - Type Field : 1 or 2 octets
+ - Value Field : Remaining octets
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Type high | Type low(*) | |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ Value |
+ | |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ (*) Present for Extended types only, used for the Value field
+ otherwise.
+
+ Type Field:
+
+ Two classes of Type Field are introduced: Regular type and
+ Extended type.
+
+ The size of Type Field for Regular types is 1 octet, and the
+ size of the Type Field for Extended types is 2 octets.
+
+ The value of the high-order octet of the Type Field determines
+ if an extended community is a Regular type or an Extended type.
+ The class of a type (Regular or Extended) is not encoded in the
+ structure of the type itself. The class of a type is specified
+ in the document that defines the type and the IANA registry.
+
+
+
+
+
+Sangli, et al. Standards Track [Page 2]
+
+RFC 4360 BGP Extended Communities Attribute February 2006
+
+
+ The high-order octet of the Type Field is as shown below:
+
+ 0 1 2 3 4 5 6 7
+ +-+-+-+-+-+-+-+-+
+ |I|T| |
+ +-+-+-+-+-+-+-+-+
+
+ I - IANA authority bit
+
+ Value 0: IANA-assignable type using the "First Come First
+ Serve" policy
+
+ Value 1: Part of this Type Field space is for IANA
+ assignable types using either the Standard Action or the
+ Early IANA Allocation policy. The rest of this Type
+ Field space is for Experimental use.
+
+ T - Transitive bit
+
+ Value 0: The community is transitive across ASes
+
+ Value 1: The community is non-transitive across ASes
+
+ Remaining 6 bits: Indicates the structure of the community
+
+ Value Field:
+
+ The encoding of the Value Field is dependent on the "type" of
+ the community as specified by the Type Field.
+
+ Two extended communities are declared equal only when all 8 octets of
+ the community are equal.
+
+ The two members in the tuple <Type, Value> should be enumerated to
+ specify any community value. The remaining octets of the community
+ are interpreted based on the value of the Type field.
+
+3. Defined BGP Extended Community Types
+
+ This section introduces a few extended types and defines the format
+ of the Value Field for those types. The types introduced here
+ provide "templates", where each template is identified by the high-
+ order octet of the extended community Type field, and the lower-order
+ octet (sub-type) is used to indicate a particular type of extended
+ community.
+
+
+
+
+
+
+Sangli, et al. Standards Track [Page 3]
+
+RFC 4360 BGP Extended Communities Attribute February 2006
+
+
+3.1. Two-Octet AS Specific Extended Community
+
+ This is an extended type with Type Field composed of 2 octets and
+ Value Field composed of 6 octets.
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | 0x00 or 0x40 | Sub-Type | Global Administrator |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Local Administrator |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ The value of the high-order octet of this extended type is either
+ 0x00 or 0x40. The low-order octet of this extended type is used to
+ indicate sub-types.
+
+ The Value Field consists of two sub-fields:
+
+ Global Administrator sub-field: 2 octets
+
+ This sub-field contains an Autonomous System number assigned by
+ IANA.
+
+ Local Administrator sub-field: 4 octets
+
+ The organization identified by Autonomous System number in the
+ Global Administrator sub-field can encode any information in
+ this sub-field. The format and meaning of the value encoded in
+ this sub-field should be defined by the sub-type of the
+ community.
+
+3.2. IPv4 Address Specific Extended Community
+
+ This is an extended type with Type Field composed of 2 octets and
+ Value Field composed of 6 octets.
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | 0x01 or 0x41 | Sub-Type | Global Administrator |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Global Administrator (cont.) | Local Administrator |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ The value of the high-order octet of this extended type is either
+ 0x01 or 0x41. The low-order octet of this extended type is used to
+ indicate sub-types.
+
+
+
+Sangli, et al. Standards Track [Page 4]
+
+RFC 4360 BGP Extended Communities Attribute February 2006
+
+
+ The Value field consists of two sub-fields:
+
+ Global Administrator sub-field: 4 octets
+
+ This sub-field contains an IPv4 unicast address assigned by one
+ of the Internet registries.
+
+ Local Administrator sub-field: 2 octets
+
+ The organization that has been assigned the IPv4 address in the
+ Global Administrator sub-field can encode any information in
+ this sub-field. The format and meaning of the value encoded
+ in this sub-field should be defined by the sub-type of the
+ community.
+
+3.3. Opaque Extended Community
+
+ This is an extended type with Type Field composed of 2 octets and
+ Value Field composed of 6 octets.
+
+ 0 1 2 3
+ 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | 0x03 or 0x43 | Sub-Type | Value |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ | Value (cont.) |
+ +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+
+ The value of the high-order octet of this extended type is either
+ 0x03 or 0x43. The low-order octet of this extended type is used to
+ indicate sub-types.
+
+ This is a generic community of extended type. The value of the sub-
+ type that should define the Value Field is to be assigned by IANA.
+
+4. Route Target Community
+
+ The Route Target Community identifies one or more routers that may
+ receive a set of routes (that carry this Community) carried by BGP.
+ This is transitive across the Autonomous System boundary.
+
+ The Route Target Community is of an extended type.
+
+ The value of the high-order octet of the Type field for the Route
+ Target Community can be 0x00, 0x01, or 0x02. The value of the low-
+ order octet of the Type field for this community is 0x02.
+
+
+
+
+
+Sangli, et al. Standards Track [Page 5]
+
+RFC 4360 BGP Extended Communities Attribute February 2006
+
+
+ When the value of the high-order octet of the Type field is 0x00 or
+ 0x02, the Local Administrator sub-field contains a number from a
+ numbering space that is administered by the organization to which the
+ Autonomous System number carried in the Global Administrator sub-
+ field has been assigned by an appropriate authority.
+
+ When the value of the high-order octet of the Type field is 0x01, the
+ Local Administrator sub-field contains a number from a numbering
+ space that is administered by the organization to which the IP
+ address carried in the Global Administrator sub-field has been
+ assigned by an appropriate authority.
+
+ One possible use of the Route Target Community is specified in
+ [RFC4364].
+
+5. Route Origin Community
+
+ The Route Origin Community identifies one or more routers that inject
+ a set of routes (that carry this Community) into BGP. This is
+ transitive across the Autonomous System boundary.
+
+ The Route Origin Community is of an extended type.
+
+ The value of the high-order octet of the Type field for the Route
+ Origin Community can be 0x00, 0x01, or 0x02. The value of the low-
+ order octet of the Type field for this community is 0x03.
+
+ When the value of the high-order octet of the Type field is 0x00 or
+ 0x02, the Local Administrator sub-field contains a number from a
+ numbering space that is administered by the organization to which the
+ Autonomous System number carried in the Global Administrator sub-
+ field has been assigned by an appropriate authority.
+
+ When the value of the high-order octet of the Type field is 0x01, the
+ Local Administrator sub-field contains a number from a numbering
+ space that is administered by the organization to which the IP
+ address carried in the Global Administrator sub-field has been
+ assigned by an appropriate authority.
+
+ One possible use of the Route Origin Community is specified in
+ [RFC4364].
+
+
+
+
+
+
+
+
+
+
+Sangli, et al. Standards Track [Page 6]
+
+RFC 4360 BGP Extended Communities Attribute February 2006
+
+
+6. Operations
+
+ A BGP speaker may use the Extended Communities attribute to control
+ which routing information it accepts or distributes to its peers.
+
+ The Extended Community attribute MUST NOT be used to modify the BGP
+ best path selection algorithm in a way that leads to forwarding
+ loops.
+
+ A BGP speaker receiving a route that doesn't have the Extended
+ Communities attribute MAY append this attribute to the route when
+ propagating it to its peers.
+
+ A BGP speaker receiving a route with the Extended Communities
+ attribute MAY modify this attribute according to the local policy.
+
+ By default if a range of routes is to be aggregated and the resultant
+ aggregate's path attributes do not carry the ATOMIC_AGGREGATE
+ attribute, then the resulting aggregate should have an Extended
+ Communities path attribute that contains the set union of all the
+ Extended Communities from all of the aggregated routes. The default
+ behavior could be overridden via local configuration, in which case
+ handling the Extended Communities attribute in the presence of route
+ aggregation becomes a matter of the local policy of the BGP speaker
+ that performs the aggregation.
+
+ If a route has a non-transitive extended community, then before
+ advertising the route across the Autonomous System boundary the
+ community SHOULD be removed from the route. However, the community
+ SHOULD NOT be removed when advertising the route across the BGP
+ Confederation boundary.
+
+ A route may carry both the BGP Communities attribute, as defined in
+ [RFC1997], and the Extended BGP Communities attribute. In this
+ case, the BGP Communities attribute is handled as specified in
+ [RFC1997], and the Extended BGP Communities attribute is handled as
+ specified in this document.
+
+7. IANA Considerations
+
+ All the BGP Extended Communities contain a Type field. The IANA has
+ created a registry entitled, "BGP Extended Communities Type". The
+ IANA will maintain this registry.
+
+ The Type could be either regular or extended. For a regular Type the
+ IANA allocates an 8-bit value; for an extended Type the IANA
+ allocates a 16-bit value.
+
+
+
+
+Sangli, et al. Standards Track [Page 7]
+
+RFC 4360 BGP Extended Communities Attribute February 2006
+
+
+ The value allocated for a regular Type MUST NOT be reused as the
+ value of the high-order octet when allocating an extended Type. The
+ value of the high-order octet allocated for an extended Type MUST NOT
+ be reused when allocating a regular Type.
+
+ The Type field indicates whether the Extended Community is transitive
+ or not. Future requests for assignment of a Type value must specify
+ whether the Type value is intended for a transitive or a non-
+ transitive Extended Community.
+
+ Future assignments are to be made using either the Standards Action
+ process defined in [RFC2434], the Early IANA Allocation process
+ defined in [RFC4020], or the "First Come First Served" policy defined
+ in [RFC2434].
+
+ The following table summarizes the ranges for the assignment of
+ Types:
+
+ Type Standard Action First Come
+ Early IANA Allocation First Served
+ ------------------ --------------------- ------------
+
+ regular, transitive 0x90-0xbf 0x00-0x3f
+
+ regular, non-transitive 0xd0-0xff 0x40-0x7f
+
+ extended, transitive 0x9000-0xbfff 0x0000-0x3fff
+
+ extended, non-transitive 0xd000-0xffff 0x4000-0x7fff
+
+ Assignments consist of a name and the value.
+
+ The Type values 0x80-0x8f and 0xc0-0xcf for regular Types, and
+ 0x8000-0x8fff and 0xc000-0xcfff for extended Types are for
+ Experimental use as defined in RFC 3692.
+
+ This document defines a class of extended communities called two-
+ octet AS specific extended community for which the IANA is to create
+ and maintain a registry entitled "Two-octet AS Specific Extended
+ Community". All the communities in this class are of extended Types.
+ Future assignments are to be made using the "First Come First Served"
+ policy defined in [RFC2434]. The Type values for the transitive
+ communities of the two-octet AS specific extended community class are
+ 0x0000-0x00ff, and for the non-transitive communities of that class
+ are 0x4000-0x40ff. Assignments consist of a name and the value.
+
+ This document makes the following assignments for the two-octet AS
+ specific extended community:
+
+
+
+Sangli, et al. Standards Track [Page 8]
+
+RFC 4360 BGP Extended Communities Attribute February 2006
+
+
+ Name Type Value
+ ---- ----------
+ two-octet AS specific Route Target 0x0002
+ two-octet AS specific Route Origin 0x0003
+
+ This document defines a class of extended communities called IPv4
+ address specific extended community for which the IANA is to create
+ and maintain a registry entitled "IPv4 Address Specific Extended
+ Community". All the communities in this class are of extended Types.
+ Future assignments are to be made using the "First Come First Served"
+ policy defined in [RFC2434]. The Type values for the transitive
+ communities of the IPv4 address specific extended community class
+ are 0x0100-0x01ff, and for the non-transitive communities of that
+ class are 0x4100-0x41ff. Assignments consist of a name and the
+ value.
+
+ This document makes the following assignments for the IPv4 address
+ specific extended community:
+
+ Name Type Value
+ ---- ----------
+ IPv4 address specific Route Target 0x0102
+ IPv4 address specific Route Origin 0x0103
+
+ This document defines a class of extended communities called opaque
+ extended community for which the IANA is to create and maintain a
+ registry entitled "Opaque Extended Community". All the communities
+ in this class are of extended Types. Future assignments are to be
+ made using the "First Come First Served" policy defined in [RFC2434].
+ The Type values for the transitive communities of the opaque extended
+ community class are 0x0300-0x03ff, and for the non-transitive
+ communities of that class are 0x4300-0x43ff. Assignments consist of
+ a name and the value.
+
+ When requesting an allocation from more than one registry defined
+ above, one may ask for allocating the same Type value from these
+ registries. If possible, the IANA should accommodate such requests.
+
+8. Security Considerations
+
+ This extension to BGP has similar security implications as BGP
+ Communities [RFC1997].
+
+ This extension to BGP does not change the underlying security issues.
+ Specifically, an operator who is relying on the information carried
+ in BGP must have a transitive trust relationship back to the source
+ of the information. Specifying the mechanism(s) to provide such a
+ relationship is beyond the scope of this document.
+
+
+
+Sangli, et al. Standards Track [Page 9]
+
+RFC 4360 BGP Extended Communities Attribute February 2006
+
+
+9. Acknowledgements
+
+ The authors would like to thank John Hawkinson, Jeffrey Haas, Bruno
+ Rijsman, Bill Fenner, and Alex Zinin for their suggestions and
+ feedback.
+
+10. Normative References
+
+ [BGP-4] Rekhter, Y. and T. Li, "A Border Gateway Protocol 4
+ (BGP-4)", RFC 4271, January 2006.
+
+ [RFC1997] Chandra, R., Traina, P., and T. Li, "BGP Communities
+ Attribute", RFC 1997, August 1996.
+
+ [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate
+ Requirement Levels", BCP 14, RFC 2119, March 1997.
+
+ [RFC2434] Narten, T. and H. Alvestrand, "Guidelines for Writing
+ an IANA Considerations Section in RFCs", BCP 26, RFC
+ 2434, October 1998.
+
+ [RFC4020] Kompella, K. and A. Zinin, "Early IANA Allocation of
+ Standards Track Code Points", BCP 100, RFC 4020,
+ February 2005.
+
+11. Informative References
+
+ [RFC4364] Rosen, E. and Y. Rekhter, "BGP/MPLS IP Virtual Private
+ Networks (VPNs)", RFC 4364, February 2006.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Sangli, et al. Standards Track [Page 10]
+
+RFC 4360 BGP Extended Communities Attribute February 2006
+
+
+Authors' Addresses
+
+ Srihari R. Sangli
+ Cisco Systems, Inc.
+
+ EMail: rsrihari@cisco.com
+
+
+ Dan Tappan
+ Cisco Systems, Inc.
+ 250 Apollo Drive
+ Chelmsford, MA 01824
+
+ EMail: tappan@cisco.com
+
+
+ Yakov Rekhter
+ Juniper Networks, Inc.
+ 1194 N. Mathilda Ave
+ Sunnyvale, CA 94089
+
+ EMail: yakov@juniper.net
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Sangli, et al. Standards Track [Page 11]
+
+RFC 4360 BGP Extended Communities Attribute February 2006
+
+
+Full Copyright Statement
+
+ Copyright (C) The Internet Society (2006).
+
+ This document is subject to the rights, licenses and restrictions
+ contained in BCP 78, and except as set forth therein, the authors
+ retain all their rights.
+
+ This document and the information contained herein are provided on an
+ "AS IS" basis and THE CONTRIBUTOR, THE ORGANIZATION HE/SHE REPRESENTS
+ OR IS SPONSORED BY (IF ANY), THE INTERNET SOCIETY AND THE INTERNET
+ ENGINEERING TASK FORCE DISCLAIM ALL WARRANTIES, EXPRESS OR IMPLIED,
+ INCLUDING BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE
+ INFORMATION HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED
+ WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+
+Intellectual Property
+
+ The IETF takes no position regarding the validity or scope of any
+ Intellectual Property Rights or other rights that might be claimed to
+ pertain to the implementation or use of the technology described in
+ this document or the extent to which any license under such rights
+ might or might not be available; nor does it represent that it has
+ made any independent effort to identify any such rights. Information
+ on the procedures with respect to rights in RFC documents can be
+ found in BCP 78 and BCP 79.
+
+ Copies of IPR disclosures made to the IETF Secretariat and any
+ assurances of licenses to be made available, or the result of an
+ attempt made to obtain a general license or permission for the use of
+ such proprietary rights by implementers or users of this
+ specification can be obtained from the IETF on-line IPR repository at
+ http://www.ietf.org/ipr.
+
+ The IETF invites any interested party to bring to its attention any
+ copyrights, patents or patent applications, or other proprietary
+ rights that may cover technology that may be required to implement
+ this standard. Please address the information to the IETF at
+ ietf-ipr@ietf.org.
+
+Acknowledgement
+
+ Funding for the RFC Editor function is provided by the IETF
+ Administrative Support Activity (IASA).
+
+
+
+
+
+
+
+Sangli, et al. Standards Track [Page 12]
+
diff --git a/doc/rfc4364.txt b/doc/rfc4364.txt
@@ -0,0 +1,2635 @@
+
+
+
+
+
+
+Network Working Group E. Rosen
+Request for Comments: 4364 Cisco Systems, Inc.
+Obsoletes: 2547 Y. Rekhter
+Category: Standards Track Juniper Networks, Inc.
+ February 2006
+
+
+ BGP/MPLS IP Virtual Private Networks (VPNs)
+
+Status of This Memo
+
+ This document specifies an Internet standards track protocol for the
+ Internet community, and requests discussion and suggestions for
+ improvements. Please refer to the current edition of the "Internet
+ Official Protocol Standards" (STD 1) for the standardization state
+ and status of this protocol. Distribution of this memo is unlimited.
+
+Copyright Notice
+
+ Copyright (C) The Internet Society (2006).
+
+Abstract
+
+ This document describes a method by which a Service Provider may use
+ an IP backbone to provide IP Virtual Private Networks (VPNs) for its
+ customers. This method uses a "peer model", in which the customers'
+ edge routers (CE routers) send their routes to the Service Provider's
+ edge routers (PE routers); there is no "overlay" visible to the
+ customer's routing algorithm, and CE routers at different sites do
+ not peer with each other. Data packets are tunneled through the
+ backbone, so that the core routers do not need to know the VPN
+ routes.
+
+ This document obsoletes RFC 2547.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+Rosen & Rekhter Standards Track [Page 1]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+Table of Contents
+
+ 1. Introduction ....................................................3
+ 1.1. Virtual Private Networks ...................................4
+ 1.2. Customer Edge and Provider Edge ............................5
+ 1.3. VPNs with Overlapping Address Spaces .......................6
+ 1.4. VPNs with Different Routes to the Same System ..............7
+ 1.5. SP Backbone Routers ........................................7
+ 1.6. Security ...................................................8
+ 2. Sites and CEs ...................................................8
+ 3. VRFs: Multiple Forwarding Tables in PEs .........................9
+ 3.1. VRFs and Attachment Circuits ...............................9
+ 3.2. Associating IP Packets with VRFs ..........................10
+ 3.3. Populating the VRFs .......................................11
+ 4. VPN Route Distribution via BGP .................................12
+ 4.1. The VPN-IPv4 Address Family ...............................13
+ 4.2. Encoding of Route Distinguishers ..........................14
+ 4.3. Controlling Route Distribution ............................15
+ 4.3.1. The Route Target Attribute .........................15
+ 4.3.2. Route Distribution Among PEs by BGP ................17
+ 4.3.3. Use of Route Reflectors ............................20
+ 4.3.4. How VPN-IPv4 NLRI Is Carried in BGP ................22
+ 4.3.5. Building VPNs Using Route Targets ..................23
+ 4.3.6. Route Distribution Among VRFs in a Single PE .......23
+ 5. Forwarding .....................................................23
+ 6. Maintaining Proper Isolation of VPNs ...........................26
+ 7. How PEs Learn Routes from CEs ..................................27
+ 8. How CEs Learn Routes from PEs ..................................30
+ 9. Carriers' Carriers .............................................30
+ 10. Multi-AS Backbones ............................................32
+ 11. Accessing the Internet from a VPN .............................34
+ 12. Management VPNs ...............................................36
+ 13. Security Considerations .......................................37
+ 13.1. Data Plane ...............................................37
+ 13.2. Control Plane ............................................39
+ 13.3. Security of P and PE Devices .............................39
+ 14. Quality of Service ............................................39
+ 15. Scalability ...................................................40
+ 16. IANA Considerations ...........................................40
+ 17. Acknowledgements ..............................................41
+ 18. Contributors ..................................................41
+ 19. Normative References ..........................................44
+ 20. Informative References ........................................45
+
+
+
+
+
+
+
+
+Rosen & Rekhter Standards Track [Page 2]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+1. Introduction
+
+ This document describes a method by which a Service Provider may use
+ an IP backbone to provide IP Virtual Private Networks (VPNs) for its
+ customers. This method uses a "peer model", in which the customers'
+ edge routers (CE routers) send their routes to the Service Provider's
+ edge routers (PE routers). Border Gateway Protocol (BGP)
+ [BGP, BGP-MP] is then used by the Service Provider to exchange the
+ routes of a particular VPN among the PE routers that are attached to
+ that VPN. This is done in a way that ensures that routes from
+ different VPNs remain distinct and separate, even if two VPNs have an
+ overlapping address space. The PE routers distribute, to the CE
+ routers in a particular VPN, the routes from the other CE routers in
+ that VPN. The CE routers do not peer with each other, hence there is
+ no "overlay" visible to the VPN's routing algorithm. The term "IP"
+ in "IP VPN" is used to indicate that the PE receives IP datagrams
+ from the CE, examines their IP headers, and routes them accordingly.
+
+ Each route within a VPN is assigned a Multiprotocol Label Switching
+ (MPLS) [MPLS-ARCH, MPLS-BGP, MPLS-ENCAPS] label; when BGP distributes
+ a VPN route, it also distributes an MPLS label for that route.
+ Before a customer data packet travels across the Service Provider's
+ backbone, it is encapsulated with the MPLS label that corresponds, in
+ the customer's VPN, to the route that is the best match to the
+ packet's destination address. This MPLS packet is further
+ encapsulated (e.g., with another MPLS label or with an IP or Generic
+ Routing Encapsulation (GRE) tunnel header [MPLS-in-IP-GRE]) so that
+ it gets tunneled across the backbone to the proper PE router. Thus,
+ the backbone core routers do not need to know the VPN routes.
+
+ The primary goal of this method is to support the case in which a
+ client obtains IP backbone services from a Service Provider or
+ Service Providers with which it maintains contractual relationships.
+ The client may be an enterprise, a group of enterprises that need an
+ extranet, an Internet Service Provider, an application service
+ provider, another VPN Service Provider that uses this same method to
+ offer VPNs to clients of its own, etc. The method makes it very
+ simple for the client to use the backbone services. It is also very
+ scalable and flexible for the Service Provider, and allows the
+ Service Provider to add value.
+
+
+
+
+
+
+
+
+
+
+
+Rosen & Rekhter Standards Track [Page 3]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+1.1. Virtual Private Networks
+
+ Consider a set of "sites" that are attached to a common network that
+ we call "the backbone". Now apply some policy to create a number of
+ subsets of that set, and impose the following rule: two sites may
+ have IP interconnectivity over that backbone only if at least one of
+ these subsets contains them both.
+
+ These subsets are Virtual Private Networks (VPNs). Two sites have IP
+ connectivity over the common backbone only if there is some VPN that
+ contains them both. Two sites that have no VPN in common have no
+ connectivity over that backbone.
+
+ If all the sites in a VPN are owned by the same enterprise, the VPN
+ may be thought of as a corporate "intranet". If the various sites in
+ a VPN are owned by different enterprises, the VPN may be thought of
+ as an "extranet". A site can be in more than one VPN; e.g., in an
+ intranet and in several extranets. In general, when we use the term
+ "VPN" we will not be distinguishing between intranets and extranets.
+
+ We refer to the owners of the sites as the "customers". We refer to
+ the owners/operators of the backbone as the "Service Providers"
+ (SPs). The customers obtain "VPN service" from the SPs.
+
+ A customer may be a single enterprise, a set of enterprises, an
+ Internet Service Provider, an Application Service Provider, another
+ SP that offers the same kind of VPN service to its own customers,
+ etc.
+
+ The policies that determine whether a particular collection of sites
+ is a VPN are the policies of the customers. Some customers will want
+ the implementation of these policies to be entirely the
+ responsibility of the SP. Other customers may want to share with the
+ SP the responsibility for implementing these policies. This document
+ specifies mechanisms that can be used to implement these policies.
+ The mechanisms we describe are general enough to allow these policies
+ to be implemented either by the SP alone or by a VPN customer
+ together with the SP. Most of the discussion is focused on the
+ former case, however.
+
+ The mechanisms discussed in this document allow the implementation of
+ a wide range of policies. For example, within a given VPN, one can
+ allow every site to have a direct route to every other site ("full
+ mesh"). Alternatively, one can force traffic between certain pairs
+ of sites to be routed via a third site. This can be useful, e.g., if
+ it is desired that traffic between a pair of sites be passed through
+ a firewall, and the firewall is located at the third site.
+
+
+
+
+Rosen & Rekhter Standards Track [Page 4]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ In this document, we restrict our discussion to the case in which the
+ customer is explicitly purchasing VPN service from an SP, or from a
+ set of SPs that have agreed to cooperate to provide the VPN service.
+ That is, the customer is not merely purchasing internet access from
+ an SP, and the VPN traffic does not pass through a random collection
+ of interconnected SP networks.
+
+ We also restrict our discussion to the case in which the backbone
+ provides an IP service to the customer, rather than, e.g., a layer 2
+ service such as Frame Relay, Asynchronous Transfer Mode (ATM),
+ ethernet, High Level Data Link Control (HDLC), or Point-to-Point
+ Protocol (PPP). The customer may attach to the backbone via one of
+ these (or other) layer 2 services, but the layer 2 service is
+ terminated at the "edge" of the backbone, where the customer's IP
+ datagrams are removed from any layer 2 encapsulation.
+
+ In the rest of this introduction, we specify some properties that
+ VPNs should have. The remainder of this document specifies a set of
+ mechanisms that can be deployed to provide a VPN model that has all
+ these properties. This section also introduces some of the technical
+ terminology used in the remainder of the document.
+
+1.2. Customer Edge and Provider Edge
+
+ Routers can be attached to each other, or to end systems, in a
+ variety of different ways: PPP connections, ATM Virtual Circuits
+ (VCs), Frame Relay VCs, ethernet interfaces, Virtual Local Area
+ Networks (VLANs) on ethernet interfaces, GRE tunnels, Layer 2
+ Tunneling Protocol (L2TP) tunnels, IPsec tunnels, etc. We will use
+ the term "attachment circuit" to refer generally to some such means
+ of attaching to a router. An attachment circuit may be the sort of
+ connection that is usually thought of as a "data link", or it may be
+ a tunnel of some sort; what matters is that it be possible for two
+ devices to be network layer peers over the attachment circuit.
+
+ Each VPN site must contain one or more Customer Edge (CE) devices.
+ Each CE device is attached, via some sort of attachment circuit, to
+ one or more Provider Edge (PE) routers.
+
+ Routers in the SP's network that do not attach to CE devices are
+ known as "P routers".
+
+ CE devices can be hosts or routers. In a typical case, a site
+ contains one or more routers, some of which are attached to PE
+ routers. The site routers that attach to the PE routers would then
+ be the CE devices, or "CE routers". However, there is nothing to
+ prevent a non-routing host from attaching directly to a PE router, in
+ which case the host would be a CE device.
+
+
+
+Rosen & Rekhter Standards Track [Page 5]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ Sometimes, what is physically attached to a PE router is a layer 2
+ switch. In this case, we do NOT say that the layer 2 switch is a CE
+ device. Rather, the CE devices are the hosts and routers that
+ communicate with the PE router through the layer 2 switch; the layer
+ 2 infrastructure is transparent. If the layer 2 infrastructure
+ provides a multipoint service, then multiple CE devices can be
+ attached to the PE router over the same attachment circuit.
+
+ CE devices are logically part of a customer's VPN. PE and P routers
+ are logically part of the SP's network.
+
+ The attachment circuit over which a packet travels when going from CE
+ to PE is known as that packet's "ingress attachment circuit", and the
+ PE as the packet's "ingress PE". The attachment circuit over which a
+ packet travels when going from PE to CE is known as that packet's
+ "egress attachment circuit", and the PE as the packet's "egress PE".
+
+ We will say that a PE router is attached to a particular VPN if it is
+ attached to a CE device that is in a site of that VPN. Similarly, we
+ will say that a PE router is attached to a particular site if it is
+ attached to a CE device that is in that site.
+
+ When the CE device is a router, it is a routing peer of the PE(s) to
+ which it is attached, but it is NOT a routing peer of CE routers at
+ other sites. Routers at different sites do not directly exchange
+ routing information with each other; in fact, they do not even need
+ to know of each other at all. As a consequence, the customer has no
+ backbone or "virtual backbone" to manage, and does not have to deal
+ with any inter-site routing issues. In other words, in the scheme
+ described in this document, a VPN is NOT an "overlay" on top of the
+ SP's network.
+
+ With respect to the management of the edge devices, clear
+ administrative boundaries are maintained between the SP and its
+ customers. Customers are not required to access the PE or P routers
+ for management purposes, nor is the SP required to access the CE
+ devices for management purposes.
+
+1.3. VPNs with Overlapping Address Spaces
+
+ If two VPNs have no sites in common, then they may have overlapping
+ address spaces. That is, a given address might be used in VPN V1 as
+ the address of system S1, but in VPN V2 as the address of a
+ completely different system S2. This is a common situation when the
+ VPNs each use an RFC 1918 private address space. Of course, within
+ each VPN, each address must be unambiguous.
+
+
+
+
+
+Rosen & Rekhter Standards Track [Page 6]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ Even two VPNs that do have sites in common may have overlapping
+ address spaces, as long as there is no need for any communication
+ between systems with such addresses and systems in the common sites.
+
+1.4. VPNs with Different Routes to the Same System
+
+ Although a site may be in multiple VPNs, it is not necessarily the
+ case that the route to a given system at that site should be the same
+ in all the VPNs. Suppose, for example, we have an intranet
+ consisting of sites A, B, and C, and an extranet consisting of A, B,
+ C, and the "foreign" site D. Suppose that at site A there is a
+ server, and we want clients from B, C, or D to be able to use that
+ server. Suppose also that at site B there is a firewall. We want
+ all the traffic from site D to the server to pass through the
+ firewall, so that traffic from the extranet can be access controlled.
+ However, we don't want traffic from C to pass through the firewall on
+ the way to the server, since this is intranet traffic.
+
+ It is possible to set up two routes to the server. One route, used
+ by sites B and C, takes the traffic directly to site A. The second
+ route, used by site D, takes the traffic instead to the firewall at
+ site B. If the firewall allows the traffic to pass, it then appears
+ to be traffic coming from site B, and follows the route to site A.
+
+1.5. SP Backbone Routers
+
+ The SP's backbone consists of the PE routers, as well as other
+ routers ("P routers") that do not attach to CE devices.
+
+ If every router in an SP's backbone had to maintain routing
+ information for all the VPNs supported by the SP, there would be
+ severe scalability problems; the number of sites that could be
+ supported would be limited by the amount of routing information that
+ could be held in a single router. It is important therefore that the
+ routing information about a particular VPN only needs to be present
+ in the PE routers that attach to that VPN. In particular, the P
+ routers do not need to have ANY per-VPN routing information
+ whatsoever. (This condition may need to be relaxed somewhat when
+ multicast routing is considered. This is not considered further in
+ this paper, but is examined in [VPN-MCAST].)
+
+ So just as the VPN owners do not have a backbone or "virtual
+ backbone" to administer, the SPs themselves do not have a separate
+ backbone or "virtual backbone" to administer for each VPN. Site-to-
+ site routing in the backbone is optimal (within the constraints of
+ the policies used to form the VPNs) and is not constrained in any way
+ by an artificial "virtual topology" of tunnels.
+
+
+
+
+Rosen & Rekhter Standards Track [Page 7]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ Section 10 discusses some of the special issues that arise when the
+ backbone spans several Service Providers.
+
+1.6. Security
+
+ VPNs of the sort being discussed here, even without making use of
+ cryptographic security measures, are intended to provide a level of
+ security equivalent to that obtainable when a layer 2 backbone (e.g.,
+ Frame Relay) is used. That is, in the absence of misconfiguration or
+ deliberate interconnection of different VPNs, it is not possible for
+ systems in one VPN to gain access to systems in another VPN. Of
+ course, the methods described herein do not by themselves encrypt the
+ data for privacy, nor do they provide a way to determine whether data
+ has been tampered with en route. If this is desired, cryptographic
+ measures must be applied in addition. (See, e.g., [MPLS/BGP-IPsec].)
+ Security is discussed in more detail in Section 13.
+
+2. Sites and CEs
+
+ From the perspective of a particular backbone network, a set of IP
+ systems may be regarded as a "site" if those systems have mutual IP
+ interconnectivity that doesn't require use of the backbone. In
+ general, a site will consist of a set of systems that are in
+ geographic proximity. However, this is not universally true. If two
+ geographic locations are connected via a leased line, over which Open
+ Shortest Path First (OSPF) protocol [OSPFv2] is running, and if that
+ line is the preferred way of communicating between the two locations,
+ then the two locations can be regarded as a single site, even if each
+ location has its own CE router. (This notion of "site" is
+ topological, rather than geographical. If the leased line goes down,
+ or otherwise ceases to be the preferred route, but the two geographic
+ locations can continue to communicate by using the VPN backbone, then
+ one site has become two.)
+
+ A CE device is always regarded as being in a single site (though as
+ we shall see in Section 3.2, a site may consist of multiple "virtual
+ sites"). A site, however, may belong to multiple VPNs.
+
+ A PE router may attach to CE devices from any number of different
+ sites, whether those CE devices are in the same or in different VPNs.
+ A CE device may, for robustness, attach to multiple PE routers, of
+ the same or of different service providers. If the CE device is a
+ router, the PE router and the CE router will appear as router
+ adjacencies to each other.
+
+ While we speak mostly of "sites" as being the basic unit of
+ interconnection, nothing here prevents a finer degree of granularity
+ in the control of interconnectivity. For example, certain systems at
+
+
+
+Rosen & Rekhter Standards Track [Page 8]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ a site may be members of an intranet as well as members of one or
+ more extranets, while other systems at the same site may be
+ restricted to being members of the intranet only. However, this
+ might require that the site have two attachment circuits to the
+ backbone, one for the intranet and one for the extranet; it might
+ further require that firewall functionality be applied on the
+ extranet attachment circuit.
+
+3. VRFs: Multiple Forwarding Tables in PEs
+
+ Each PE router maintains a number of separate forwarding tables. One
+ of the forwarding tables is the "default forwarding table". The
+ others are "VPN Routing and Forwarding tables", or "VRFs".
+
+3.1. VRFs and Attachment Circuits
+
+ Every PE/CE attachment circuit is associated, by configuration, with
+ one or more VRFs. An attachment circuit that is associated with a
+ VRF is known as a "VRF attachment circuit".
+
+ In the simplest and most typical case, a PE/CE attachment
+ circuit is associated with exactly one VRF. When an IP packet is
+ received over a particular attachment circuit, its destination IP
+ address is looked up in the associated VRF. The result of that
+ lookup determines how to route the packet. The VRF used by a
+ packet's ingress PE for routing a particular packet is known as the
+ packet's "ingress VRF". (There is also the notion of a packet's
+ "egress VRF", located at the packet's egress PE; this is discussed in
+ Section 5.)
+
+ If an IP packet arrives over an attachment circuit that is not
+ associated with any VRF, the packet's destination address is looked
+ up in the default forwarding table, and the packet is routed
+ accordingly. Packets forwarded according to the default forwarding
+ table include packets from neighboring P or PE routers, as well as
+ packets from customer-facing attachment circuits that have not been
+ associated with VRFs.
+
+ Intuitively, one can think of the default forwarding table as
+ containing "public routes", and of the VRFs as containing "private
+ routes". One can similarly think of VRF attachment circuits as being
+ "private", and of non-VRF attachment circuits as being "public".
+
+ If a particular VRF attachment circuit connects site S to a PE
+ router, then connectivity from S (via that attachment circuit) can be
+ restricted by controlling the set of routes that gets entered in the
+ corresponding VRF. The set of routes in that VRF should be limited
+ to the set of routes leading to sites that have at least one VPN in
+
+
+
+Rosen & Rekhter Standards Track [Page 9]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ common with S. Then a packet sent from S over a VRF attachment
+ circuit can only be routed by the PE to another site S' if S' is in
+ one of the same VPNs as S. That is, communication (via PE routers)
+ is prevented between any pair of VPN sites that have no VPN in
+ common. Communication between VPN sites and non-VPN sites is
+ prevented by keeping the routes to the VPN sites out of the default
+ forwarding table.
+
+ If there are multiple attachment circuits leading from S to one or
+ more PE routers, then there might be multiple VRFs that could be used
+ to route traffic from S. To properly restrict S's connectivity, the
+ same set of routes would have to exist in all the VRFs.
+ Alternatively, one could impose different connectivity restrictions
+ over different attachment circuits from S. In that case, some of the
+ VRFs associated with attachment circuits from S would contain
+ different sets of routes than some of the others.
+
+ We allow the case in which a single attachment circuit is associated
+ with a set of VRFs, rather than with a single VRF. This can be
+ useful if it is desired to divide a single VPN into several
+ "sub-VPNs", each with different connectivity restrictions, where some
+ characteristic of the customer packets is used to select from among
+ the sub-VPNs. For simplicity though, we will usually speak of an
+ attachment circuit as being associated with a single VRF.
+
+3.2. Associating IP Packets with VRFs
+
+ When a PE router receives a packet from a CE device, it must
+ determine the attachment circuit over which the packet arrived, as
+ this determines in turn the VRF (or set of VRFs) that can be used for
+ forwarding that packet. In general, to determine the attachment
+ circuit over which a packet arrived, a PE router takes note of the
+ physical interface over which the packet arrived, and possibly also
+ takes note of some aspect of the packet's layer 2 header. For
+ example, if a packet's ingress attachment circuit is a Frame Relay
+ VC, the identity of the attachment circuit can be determined from the
+ physical Frame Relay interface over which the packet arrived,
+ together with the Data Link Connection Identifier (DLCI) field in the
+ packet's Frame Relay header.
+
+ Although the PE's conclusion that a particular packet arrived on a
+ particular attachment circuit may be partially determined by the
+ packet's layer 2 header, it must be impossible for a customer, by
+ writing the header fields, to fool the SP into thinking that a packet
+ that was received over one attachment circuit really arrived over a
+ different one. In the example above, although the attachment circuit
+ is determined partially by inspection of the DLCI field in the Frame
+ Relay header, this field cannot be set freely by the customer.
+
+
+
+Rosen & Rekhter Standards Track [Page 10]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ Rather, it must be set to a value specified by the SP, or else the
+ packet cannot arrive at the PE router.
+
+ In some cases, a particular site may be divided by the customer into
+ several "virtual sites". The SP may designate a particular set of
+ VRFs to be used for routing packets from that site and may allow the
+ customer to set some characteristic of the packet, which is then used
+ for choosing a particular VRF from the set.
+
+ For example, each virtual site might be realized as a VLAN. The SP
+ and the customer could agree that on packets arriving from a
+ particular CE, certain VLAN values would be used to identify certain
+ VRFs. Of course, packets from that CE would be discarded by the PE
+ if they carry VLAN tag values that are not in the agreed-upon set.
+ Another way to accomplish this is to use IP source addresses. In
+ this case, the PE uses the IP source address in a packet received
+ from the CE, along with the interface over which the packet is
+ received, to assign the packet to a particular VRF. Again, the
+ customer would only be able to select from among the particular set
+ of VRFs that that customer is allowed to use.
+
+ If it is desired to have a particular host be in multiple virtual
+ sites, then that host must determine, for each packet, which virtual
+ site the packet is associated with. It can do this, e.g., by sending
+ packets from different virtual sites on different VLANs, or out
+ different network interfaces.
+
+3.3. Populating the VRFs
+
+ With what set of routes are the VRFs populated?
+
+ As an example, let PE1, PE2, and PE3 be three PE routers, and let
+ CE1, CE2, and CE3 be three CE routers. Suppose that PE1 learns, from
+ CE1, the routes that are reachable at CE1's site. If PE2 and PE3 are
+ attached, respectively, to CE2 and CE3, and there is some VPN V
+ containing CE1, CE2, and CE3, then PE1 uses BGP to distribute to PE2
+ and PE3 the routes that it has learned from CE1. PE2 and PE3 use
+ these routes to populate the VRFs that they associate, respectively,
+ with the sites of CE2 and CE3. Routes from sites that are not in VPN
+ V do not appear in these VRFs, which means that packets from CE2 or
+ CE3 cannot be sent to sites that are not in VPN V.
+
+ When we speak of a PE "learning" routes from a CE, we are not
+ presupposing any particular learning technique. The PE may learn
+ routes by means of a dynamic routing algorithm, but it may also
+ "learn" routes by having those routes configured (i.e., static
+ routing). (In this case, to say that the PE "learned" the routes
+ from the CE is perhaps to exercise a bit of poetic license.)
+
+
+
+Rosen & Rekhter Standards Track [Page 11]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ PEs also need to learn, from other PEs, the routes that belong to a
+ given VPN. The procedures to be used for populating the VRFs with
+ the proper sets of routes are specified in Section 4.
+
+ If there are multiple attachment circuits leading from a particular
+ PE router to a particular site, they might all be mapped to the same
+ forwarding table. But if policy dictates, they could be mapped to
+ different forwarding tables. For instance, the policy might be that
+ a particular attachment circuit from a site is used only for intranet
+ traffic, while another attachment circuit from that site is used only
+ for extranet traffic. (Perhaps, e.g., the CE attached to the
+ extranet attachment circuit is a firewall, while the CE attached to
+ the intranet attachment circuit is not.) In this case, the two
+ attachment circuits would be associated with different VRFs.
+
+ Note that if two attachment circuits are associated with the same
+ VRF, then packets that the PE receives over one of them will be able
+ to reach exactly the same set of destinations as packets that the PE
+ receives over the other. So two attachment circuits cannot be
+ associated with the same VRF unless each CE is in the exact same set
+ of VPNs as is the other.
+
+ If an attachment circuit leads to a site which is in multiple VPNs,
+ the attachment circuit may still be associated with a single VRF, in
+ which case the VRF will contain routes from the full set of VPNs of
+ which the site is a member.
+
+4. VPN Route Distribution via BGP
+
+ PE routers use BGP to distribute VPN routes to each other (more
+ accurately, to cause VPN routes to be distributed to each other).
+
+ We allow each VPN to have its own address space, which means that a
+ given address may denote different systems in different VPNs. If two
+ routes to the same IP address prefix are actually routes to different
+ systems, it is important to ensure that BGP not treat them as
+ comparable. Otherwise, BGP might choose to install only one of them,
+ making the other system unreachable. Further, we must ensure that
+ POLICY is used to determine which packets get sent on which routes;
+ given that several such routes are installed by BGP, only one such
+ must appear in any particular VRF.
+
+ We meet these goals by the use of a new address family, as specified
+ below.
+
+
+
+
+
+
+
+Rosen & Rekhter Standards Track [Page 12]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+4.1. The VPN-IPv4 Address Family
+
+ The BGP Multiprotocol Extensions [BGP-MP] allow BGP to carry routes
+ from multiple "address families". We introduce the notion of the
+ "VPN-IPv4 address family". A VPN-IPv4 address is a 12-byte quantity,
+ beginning with an 8-byte Route Distinguisher (RD) and ending with a
+ 4-byte IPv4 address. If several VPNs use the same IPv4 address
+ prefix, the PEs translate these into unique VPN-IPv4 address
+ prefixes. This ensures that if the same address is used in several
+ different VPNs, it is possible for BGP to carry several completely
+ different routes to that address, one for each VPN.
+
+ Since VPN-IPv4 addresses and IPv4 addresses are different address
+ families, BGP never treats them as comparable addresses.
+
+ An RD is simply a number, and it does not contain any inherent
+ information; it does not identify the origin of the route or the set
+ of VPNs to which the route is to be distributed. The purpose of the
+ RD is solely to allow one to create distinct routes to a common IPv4
+ address prefix. Other means are used to determine where to
+ redistribute the route (see Section 4.3).
+
+ The RD can also be used to create multiple different routes to the
+ very same system. We have already discussed a situation in which the
+ route to a particular server should be different for intranet traffic
+ than for extranet traffic. This can be achieved by creating two
+ different VPN-IPv4 routes that have the same IPv4 part, but different
+ RDs. This allows BGP to install multiple different routes to the
+ same system, and allows policy to be used (see Section 4.3.5) to
+ decide which packets use which route.
+
+ The RDs are structured so that every Service Provider can administer
+ its own "numbering space" (i.e., can make its own assignments of
+ RDs), without conflicting with the RD assignments made by any other
+ Service Provider. An RD consists of three fields: a 2-byte type
+ field, an administrator field, and an assigned number field. The
+ value of the type field determines the lengths of the other two
+ fields, as well as the semantics of the administrator field. The
+ administrator field identifies an assigned number authority, and the
+ assigned number field contains a number that has been assigned, by
+ the identified authority, for a particular purpose. For example, one
+ could have an RD whose administrator field contains an Autonomous
+ System number (ASN), and whose (4-byte) number field contains a
+ number assigned by the SP to whom that ASN belongs (having been
+ assigned to that SP by the appropriate authority).
+
+ RDs are given this structure in order to ensure that an SP that
+ provides VPN backbone service can always create a unique RD when it
+
+
+
+Rosen & Rekhter Standards Track [Page 13]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ needs to do so. However, the structure is not meaningful to BGP;
+ when BGP compares two such address prefixes, it ignores the structure
+ entirely.
+
+ A PE needs to be configured such that routes that lead to a
+ particular CE become associated with a particular RD. The
+ configuration may cause all routes leading to the same CE to be
+ associated with the same RD, or it may cause different routes to be
+ associated with different RDs, even if they lead to the same CE.
+
+4.2. Encoding of Route Distinguishers
+
+ As stated, a VPN-IPv4 address consists of an 8-byte Route
+ Distinguisher followed by a 4-byte IPv4 address. The RDs are encoded
+ as follows:
+
+ - Type Field: 2 bytes
+ - Value Field: 6 bytes
+
+ The interpretation of the Value field depends on the value of the
+ type field. At the present time, three values of the type field are
+ defined: 0, 1, and 2.
+
+ - Type 0: The Value field consists of two subfields:
+
+ * Administrator subfield: 2 bytes
+ * Assigned Number subfield: 4 bytes
+
+ The Administrator subfield must contain an Autonomous System
+ number. If this ASN is from the public ASN space, it must have
+ been assigned by the appropriate authority (use of ASN values
+ from the private ASN space is strongly discouraged). The
+ Assigned Number subfield contains a number from a numbering space
+ that is administered by the enterprise to which the ASN has been
+ assigned by an appropriate authority.
+
+ - Type 1: The Value field consists of two subfields:
+
+ * Administrator subfield: 4 bytes
+ * Assigned Number subfield: 2 bytes
+
+ The Administrator subfield must contain an IP address. If this
+ IP address is from the public IP address space, it must have been
+ assigned by an appropriate authority (use of addresses from the
+ private IP address space is strongly discouraged). The Assigned
+ Number subfield contains a number from a numbering space which is
+ administered by the enterprise to which the IP address has been
+ assigned.
+
+
+
+Rosen & Rekhter Standards Track [Page 14]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ - Type 2: The Value field consists of two subfields:
+
+ * Administrator subfield: 4 bytes
+ * Assigned Number subfield: 2 bytes
+
+ The Administrator subfield must contain a 4-byte Autonomous
+ System number [BGP-AS4]. If this ASN is from the public ASN
+ space, it must have been assigned by the appropriate authority
+ (use of ASN values from the private ASN space is strongly
+ discouraged). The Assigned Number subfield contains a number
+ from a numbering space which is administered by the enterprise to
+ which the ASN has been assigned by an appropriate authority.
+
+4.3. Controlling Route Distribution
+
+ In this section, we discuss the way in which the distribution of the
+ VPN-IPv4 routes is controlled.
+
+ If a PE router is attached to a particular VPN (by being attached to
+ a particular CE in that VPN), it learns some of that VPN's IP routes
+ from the attached CE router. Routes learned from a CE routing peer
+ over a particular attachment circuit may be installed in the VRF
+ associated with that attachment circuit. Exactly which routes are
+ installed in this manner is determined by the way in which the PE
+ learns routes from the CE. In particular, when the PE and CE are
+ routing protocol peers, this is determined by the decision process of
+ the routing protocol; this is discussed in Section 7.
+
+ These routes are then converted to VPN-IPv4 routes, and "exported" to
+ BGP. If there is more than one route to a particular VPN-IPv4 address
+ prefix, BGP chooses the "best" one, using the BGP decision process.
+ That route is then distributed by BGP to the set of other PEs that
+ need to know about it. At these other PEs, BGP will again choose the
+ best route for a particular VPN-IPv4 address prefix. Then the chosen
+ VPN-IPv4 routes are converted back into IP routes, and "imported" into
+ one or more VRFs. Whether they are actually installed in the VRFs
+ depends on the decision process of the routing method used between
+ the PE and those CEs that are associated with the VRF in question.
+ Finally, any route installed in a VRF may be distributed to the
+ associated CE routers.
+
+4.3.1. The Route Target Attribute
+
+ Every VRF is associated with one or more Route Target (RT)
+ attributes.
+
+ When a VPN-IPv4 route is created (from an IPv4 route that the PE has
+ learned from a CE) by a PE router, it is associated with one or more
+
+
+
+Rosen & Rekhter Standards Track [Page 15]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ Route Target attributes. These are carried in BGP as attributes of
+ the route.
+
+ Any route associated with Route Target T must be distributed to every
+ PE router that has a VRF associated with Route Target T. When such a
+ route is received by a PE router, it is eligible to be installed in
+ those of the PE's VRFs that are associated with Route Target T.
+ (Whether it actually gets installed depends upon the outcome of the
+ BGP decision process, and upon the outcome of the decision process of
+ the IGP (i.e., the intra-domain routing protocol) running on the
+ PE/CE interface.)
+
+ A Route Target attribute can be thought of as identifying a set of
+ sites. (Though it would be more precise to think of it as
+ identifying a set of VRFs.) Associating a particular Route Target
+ attribute with a route allows that route to be placed in the VRFs
+ that are used for routing traffic that is received from the
+ corresponding sites.
+
+ There is a set of Route Targets that a PE router attaches to a route
+ received from site S; these may be called the "Export Targets". And
+ there is a set of Route Targets that a PE router uses to determine
+ whether a route received from another PE router could be placed in
+ the VRF associated with site S; these may be called the "Import
+ Targets". The two sets are distinct, and need not be the same. Note
+ that a particular VPN-IPv4 route is only eligible for installation in
+ a particular VRF if there is some Route Target that is both one of
+ the route's Route Targets and one of the VRF's Import Targets.
+
+ The function performed by the Route Target attribute is similar to
+ that performed by the BGP Communities attribute. However, the format
+ of the latter is inadequate for present purposes, since it allows
+ only a 2-byte numbering space. It is desirable to structure the
+ format, similar to what we have described for RDs (see Section 4.2),
+ so that a type field defines the length of an administrator field,
+ and the remainder of the attribute is a number from the specified
+ administrator's numbering space. This can be done using BGP Extended
+ Communities. The Route Targets discussed herein are encoded as BGP
+ Extended Community Route Targets [BGP-EXTCOMM]. They are structured
+ similarly to the RDs.
+
+ When a BGP speaker has received more than one route to the same VPN-
+ IPv4 prefix, the BGP rules for route preference are used to choose
+ which VPN-IPv4 route is installed by BGP.
+
+ Note that a route can only have one RD, but it can have multiple
+ Route Targets. In BGP, scalability is improved if one has a single
+ route with multiple attributes, as opposed to multiple routes. One
+
+
+
+Rosen & Rekhter Standards Track [Page 16]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ could eliminate the Route Target attribute by creating more routes
+ (i.e., using more RDs), but the scaling properties would be less
+ favorable.
+
+ How does a PE determine which Route Target attributes to associate
+ with a given route? There are a number of different possible ways.
+ The PE might be configured to associate all routes that lead to a
+ specified site with a specified Route Target. Or the PE might be
+ configured to associate certain routes leading to a specified site
+ with one Route Target, and certain with another.
+
+ If the PE and the CE are themselves BGP peers (see Section 7), then
+ the SP may allow the customer, within limits, to specify how its
+ routes are to be distributed. The SP and the customer would need to
+ agree in advance on the set of RTs that are allowed to be attached to
+ the customer's VPN routes. The CE could then attach one or more of
+ those RTs to each IP route that it distributes to the PE. This gives
+ the customer the freedom to specify in real time, within agreed-upon
+ limits, its route distribution policies. If the CE is allowed to
+ attach RTs to its routes, the PE MUST filter out all routes that
+ contain RTs that the customer is not allowed to use. If the CE is
+ not allowed to attach RTs to its routes, but does so anyway, the PE
+ MUST remove the RT before converting the customer's route to a VPN-
+ IPv4 route.
+
+4.3.2. Route Distribution Among PEs by BGP
+
+ If two sites of a VPN attach to PEs that are in the same Autonomous
+ System, the PEs can distribute VPN-IPv4 routes to each other by means
+ of an IBGP connection between them. (The term "IBGP" refers to the
+ set of protocols and procedures used when there is a BGP connection
+ between two BGP speakers in the same Autonomous System. This is
+ distinguished from "EBGP", the set of procedures used between two BGP
+ speakers in different Autonomous Systems.) Alternatively, each can
+ have an IBGP connection to a route reflector [BGP-RR].
+
+ When a PE router distributes a VPN-IPv4 route via BGP, it uses its
+ own address as the "BGP next hop". This address is encoded as a
+ VPN-IPv4 address with an RD of 0. ([BGP-MP] requires that the next
+ hop address be in the same address family as the Network Layer
+ Reachability Information (NLRI).) It also assigns and distributes an
+ MPLS label. (Essentially, PE routers distribute not VPN-IPv4 routes,
+ but Labeled VPN-IPv4 routes. Cf. [MPLS-BGP].) When the PE processes
+ a received packet that has this label at the top of the stack, the PE
+ will pop the stack, and process the packet appropriately.
+
+
+
+
+
+
+Rosen & Rekhter Standards Track [Page 17]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ The PE may distribute the exact set of routes that appears in the
+ VRF, or it may perform summarization and distribute aggregates of
+ those routes, or it may do some of one and some of the other.
+
+ Suppose that a PE has assigned label L to route R, and has
+ distributed this label mapping via BGP. If R is an aggregate of a
+ set of routes in the VRF, the PE will know that packets from the
+ backbone that arrive with this label must have their destination
+ addresses looked up in a VRF. When the PE looks up the label in its
+ Label Information Base, it learns which VRF must be used. On the
+ other hand, if R is not an aggregate, then when the PE looks up the
+ label, it learns the egress attachment circuit, as well as the
+ encapsulation header for the packet. In this case, no lookup in the
+ VRF is done.
+
+ We would expect that the most common case would be the case where the
+ route is NOT an aggregate. The case where it is an aggregate can be
+ very useful though if the VRF contains a large number of host routes
+ (e.g., as in dial-in), or if the VRF has an associated Local Area
+ Network (LAN) interface (where there is a different outgoing layer 2
+ header for each system on the LAN, but a route is not distributed for
+ each such system).
+
+ Whether or not each route has a distinct label is an implementation
+ matter. There are a number of possible algorithms one could use to
+ determine whether two routes get assigned the same label:
+
+ - One may choose to have a single label for an entire VRF, so that
+ a single label is shared by all the routes from that VRF. Then
+ when the egress PE receives a packet with that label, it must
+ look up the packet's IP destination address in that VRF (the
+ packet's "egress VRF"), in order to determine the packet's egress
+ attachment circuit and the corresponding data link encapsulation.
+
+ - One may choose to have a single label for each attachment
+ circuit, so that a single label is shared by all the routes with
+ the same "outgoing attachment circuit". This enables one to
+ avoid doing a lookup in the egress VRF, though some sort of
+ lookup may need to be done in order to determine the data link
+ encapsulation, e.g., an Address Resolution Protocol (ARP) lookup.
+
+ - One may choose to have a distinct label for each route. Then if
+ a route is potentially reachable over more than one attachment
+ circuit, the PE/CE routing can switch the preferred path for a
+ route from one attachment circuit to another, without there being
+ any need to distribute a new label for that route.
+
+
+
+
+
+Rosen & Rekhter Standards Track [Page 18]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ There may be other possible algorithms as well. The choice of
+ algorithm is entirely at the discretion of the egress PE, and is
+ otherwise transparent.
+
+ In using BGP-distributed MPLS labels in this manner, we presuppose
+ that an MPLS packet carrying such a label can be tunneled from the
+ router that installs the corresponding BGP-distributed route to the
+ router that is the BGP next hop of that route. This requires either
+ that a label switched path exist between those two routers or else
+ that some other tunneling technology (e.g., [MPLS-in-IP-GRE]) can be
+ used between them.
+
+ This tunnel may follow a "best effort" route, or it may follow a
+ traffic-engineered route. Between a given pair of routers, there may
+ be one such tunnel, or there may be several, perhaps with different
+ Quality of Service (QoS) characteristics. All that matters for the
+ VPN architecture is that some such tunnel exists. To ensure
+ interoperability among systems that implement this VPN architecture
+ using MPLS label switched paths as the tunneling technology, all such
+ systems MUST support Label Distribution Protocol (LDP) [MPLS-LDP].
+ In particular, Downstream Unsolicited mode MUST be supported on
+ interfaces that are neither Label Controlled ATM (LC-ATM) [MPLS-ATM]
+ nor Label Controlled Frame Relay (LC-FR) [MPLS-FR] interfaces, and
+ Downstream on Demand mode MUST be supported on LC-ATM interfaces and
+ LC-FR interfaces.
+
+ If the tunnel follows a best-effort route, then the PE finds the
+ route to the remote endpoint by looking up its IP address in the
+ default forwarding table.
+
+ A PE router, UNLESS it is a route reflector (see Section 4.3.3) or an
+ Autonomous System Border Router (ASBR) for an inter-provider VPN (see
+ Section 10), should not install a VPN-IPv4 route unless it has at
+ least one VRF with an Import Target identical to one of the route's
+ Route Target attributes. Inbound filtering should be used to cause
+ such routes to be discarded. If a new Import Target is later added
+ to one of the PE's VRFs (a "VPN Join" operation), it must then
+ acquire the routes it may previously have discarded. This can be
+ done using the refresh mechanism described in [BGP-RFSH]. The
+ outbound route filtering mechanism of [BGP-ORF] can also be used to
+ advantage to make the filtering more dynamic.
+
+ Similarly, if a particular Import Target is no longer present in any
+ of a PE's VRFs (as a result of one or more "VPN Prune" operations),
+ the PE may discard all routes that, as a result, no longer have any
+ of the PE's VRF's Import Targets as one of their Route Target
+ attributes.
+
+
+
+
+Rosen & Rekhter Standards Track [Page 19]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ A router that is not attached to any VPN and that is not a Route
+ Reflector (i.e., a P router) never installs any VPN-IPv4 routes at
+ all.
+
+ Note that VPN Join and Prune operations are non-disruptive and do not
+ require any BGP connections to be brought down, as long as the
+ refresh mechanism of [BGP-RFSH] is used.
+
+ As a result of these distribution rules, no one PE ever needs to
+ maintain all routes for all VPNs; this is an important scalability
+ consideration.
+
+4.3.3. Use of Route Reflectors
+
+ Rather than having a complete IBGP mesh among the PEs, it is
+ advantageous to make use of BGP Route Reflectors [BGP-RR] to improve
+ scalability. All the usual techniques for using route reflectors to
+ improve scalability (e.g., route reflector hierarchies) are
+ available.
+
+ Route reflectors are the only systems that need to have routing
+ information for VPNs to which they are not directly attached.
+ However, there is no need to have any one route reflector know all
+ the VPN-IPv4 routes for all the VPNs supported by the backbone.
+
+ We outline below two different ways to partition the set of VPN-IPv4
+ routes among a set of route reflectors.
+
+ 1. Each route reflector is preconfigured with a list of Route
+ Targets. For redundancy, more than one route reflector may be
+ preconfigured with the same list. A route reflector uses the
+ preconfigured list of Route Targets to construct its inbound
+ route filtering. The route reflector may use the techniques of
+ [BGP-ORF] to install on each of its peers (regardless of
+ whether the peer is another route reflector or a PE) the set of
+ Outbound Route Filters (ORFs) that contains the list of its
+ preconfigured Route Targets. Note that route reflectors should
+ accept ORFs from other route reflectors, which means that route
+ reflectors should advertise the ORF capability to other route
+ reflectors.
+
+ A service provider may modify the list of preconfigured Route
+ Targets on a route reflector. When this is done, the route
+ reflector modifies the ORFs it installs on all of its IBGP
+ peers. To reduce the frequency of configuration changes on
+ route reflectors, each route reflector may be preconfigured
+ with a block of Route Targets. This way, when a new Route
+ Target is needed for a new VPN, there is already one or more
+
+
+
+Rosen & Rekhter Standards Track [Page 20]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ route reflectors that are (pre)configured with this Route
+ Target.
+
+ Unless a given PE is a client of all route reflectors, when a
+ new VPN is added to the PE ("VPN Join"), it will need to become
+ a client of the route reflector(s) that maintain routes for
+ that VPN. Likewise, deleting an existing VPN from the PE ("VPN
+ Prune") may result in a situation where the PE no longer needs
+ to be a client of some route reflector(s). In either case, the
+ Join or Prune operation is non-disruptive (as long as
+ [BGP-RFSH] is used), and never requires a BGP connection to be
+ brought down, only to be brought right back up.
+
+ (By "adding a new VPN to a PE", we really mean adding a new
+ import Route Target to one of its VRFs, or adding a new VRF
+ with an import Route Target not had by any of the PE's other
+ VRFs.)
+
+ 2. Another method is to have each PE be a client of some subset of
+ the route reflectors. A route reflector is not preconfigured
+ with the list of Route Targets, and does not perform inbound
+ route filtering of routes received from its clients (PEs);
+ rather, it accepts all the routes received from all of its
+ clients (PEs). The route reflector keeps track of the set of
+ the Route Targets carried by all the routes it receives. When
+ the route reflector receives from its client a route with a
+ Route Target that is not in this set, this Route Target is
+ immediately added to the set. On the other hand, when the
+ route reflector no longer has any routes with a particular
+ Route Target that is in the set, the route reflector should
+ delay (by a few hours) the deletion of this Route Target from
+ the set.
+
+ The route reflector uses this set to form the inbound route
+ filters that it applies to routes received from other route
+ reflectors. The route reflector may also use ORFs to install
+ the appropriate outbound route filtering on other route
+ reflectors. Just like with the first approach, a route
+ reflector should accept ORFs from other route reflectors. To
+ accomplish this, a route reflector advertises ORF capability to
+ other route reflectors.
+
+ When the route reflector changes the set, it should immediately
+ change its inbound route filtering. In addition, if the route
+ reflector uses ORFs, then the ORFs have to be immediately
+ changed to reflect the changes in the set. If the route
+ reflector doesn't use ORFs, and a new Route Target is added to
+
+
+
+
+Rosen & Rekhter Standards Track [Page 21]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ the set, the route reflector, after changing its inbound route
+ filtering, must issue BGP Refresh to other route reflectors.
+
+ The delay of "a few hours" mentioned above allows a route
+ reflector to hold onto routes with a given RT, even after it
+ loses the last of its clients that are interested in such
+ routes. This protects against the need to reacquire all such
+ routes if the clients' "disappearance" is only temporary.
+
+ With this procedure, VPN Join and Prune operations are also
+ non-disruptive.
+
+ Note that this technique will not work properly if some client
+ PE has a VRF with an import Route Target that is not one of its
+ export Route Targets.
+
+ In these procedures, a PE router which attaches to a particular VPN
+ "auto-discovers" the other PEs that attach to the same VPN. When a
+ new PE router is added, or when an existing PE router attaches to a
+ new VPN, no reconfiguration of other PE routers is needed.
+
+ Just as there is no one PE router that needs to know all the VPN-IPv4
+ routes supported over the backbone, these distribution rules ensure
+ that there is no one Route Reflector (RR) that needs to know all the
+ VPN-IPv4 routes supported over the backbone. As a result, the total
+ number of such routes that can be supported over the backbone is not
+ bounded by the capacity of any single device, and therefore can
+ increase virtually without bound.
+
+4.3.4. How VPN-IPv4 NLRI Is Carried in BGP
+
+ The BGP Multiprotocol Extensions [BGP-MP] are used to encode the
+ NLRI. If the Address Family Identifier (AFI) field is set to 1, and
+ the Subsequent Address Family Identifier (SAFI) field is set to 128,
+ the NLRI is an MPLS-labeled VPN-IPv4 address. AFI 1 is used since
+ the network layer protocol associated with the NLRI is still IP.
+ Note that this VPN architecture does not require the capability to
+ distribute unlabeled VPN-IPv4 addresses.
+
+ In order for two BGP speakers to exchange labeled VPN-IPv4 NLRI, they
+ must use BGP Capabilities Advertisement to ensure that they both are
+ capable of properly processing such NLRI. This is done as specified
+ in [BGP-MP], by using capability code 1 (multiprotocol BGP), with an
+ AFI of 1 and an SAFI of 128.
+
+ The labeled VPN-IPv4 NLRI itself is encoded as specified in
+ [MPLS-BGP], where the prefix consists of an 8-byte RD followed by an
+ IPv4 prefix.
+
+
+
+Rosen & Rekhter Standards Track [Page 22]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+4.3.5. Building VPNs Using Route Targets
+
+ By setting up the Import Targets and Export Targets properly, one can
+ construct different kinds of VPNs.
+
+ Suppose it is desired to create a fully meshed closed user group,
+ i.e., a set of sites where each can send traffic directly to the
+ other, but traffic cannot be sent to or received from other sites.
+ Then each site is associated with a VRF, a single Route Target
+ attribute is chosen, that Route Target is assigned to each VRF as
+ both the Import Target and the Export Target, and that Route Target
+ is not assigned to any other VRFs as either the Import Target or the
+ Export Target.
+
+ Alternatively, suppose one desired, for whatever reason, to create a
+ "hub and spoke" kind of VPN. This could be done by the use of two
+ Route Target values, one meaning "Hub" and one meaning "Spoke". At
+ the VRFs attached to the hub sites, "Hub" is the Export Target and
+
+ "Spoke" is the Import Target. At the VRFs attached to the spoke
+ site, "Hub" is the Import Target and "Spoke" is the Export Target.
+
+ Thus, the methods for controlling the distribution of routing
+ information among various sets of sites are very flexible, which in
+ turn provides great flexibility in constructing VPNs.
+
+4.3.6. Route Distribution Among VRFs in a Single PE
+
+ It is possible to distribute routes from one VRF to another, even if
+ both VRFs are in the same PE, even though in this case one cannot say
+ that the route has been distributed by BGP. Nevertheless, the
+ decision to distribute a particular route from one VRF to another
+ within a single PE is the same decision that would be made if the
+ VRFs were on different PEs. That is, it depends on the Route Target
+ attribute that is assigned to the route (or would be assigned if the
+ route were distributed by BGP), and the import target of the second
+ VRF.
+
+5. Forwarding
+
+ If the intermediate routers in the backbone do not have any
+ information about the routes to the VPNs, how are packets forwarded
+ from one VPN site to another?
+
+ When a PE receives an IP packet from a CE device, it chooses a
+ particular VRF in which to look up the packet's destination address.
+ This choice is based on the packet's ingress attachment circuit.
+
+
+
+
+Rosen & Rekhter Standards Track [Page 23]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ Assume that a match is found. As a result we learn the packet's
+ "next hop".
+
+ If the packet's next hop is reached directly over a VRF attachment
+ circuit from this PE (i.e., the packet's egress attachment circuit is
+ on the same PE as its ingress attachment circuit), then the packet is
+ sent on the egress attachment circuit, and no MPLS labels are pushed
+ onto the packet's label stack.
+
+ If the ingress and egress attachment circuits are on the same PE, but
+ are associated with different VRFs, and if the route that best
+ matches the destination address in the ingress attachment circuit's
+ VRF is an aggregate of several routes in the egress attachment
+ circuit's VRF, it may be necessary to look up the packet's
+ destination address in the egress VRF as well.
+
+ If the packet's next hop is NOT reached through a VRF attachment
+ circuit, then the packet must travel at least one hop through the
+ backbone. The packet thus has a "BGP Next Hop", and the BGP Next Hop
+ will have assigned an MPLS label for the route that best matches the
+ packet's destination address. Call this label the "VPN route label".
+ The IP packet is turned into an MPLS packet with the VPN route label
+ as the sole label on the label stack.
+
+ The packet must then be tunneled to the BGP Next Hop.
+
+ If the backbone supports MPLS, this is done as follows:
+
+ - The PE routers (and any Autonomous System border routers) that
+ redistribute VPN-IPv4 addresses need to insert /32 address
+ prefixes for themselves into the IGP routing tables of the
+ backbone. This enables MPLS, at each node in the backbone
+ network, to assign a label corresponding to the route to each PE
+ router. To ensure interoperability among different
+ implementations, it is required to support LDP for setting up the
+ label switched paths across the backbone. However, other methods
+ of setting up these label switched paths are also possible.
+ (Some of these other methods may not require the presence of the
+ /32 address prefixes in the IGP.)
+
+ - If there are any traffic engineering tunnels to the BGP next hop,
+ and if one or more of those is available for use by the packet in
+ question, one of these tunnels is chosen. This tunnel will be
+ associated with an MPLS label, the "tunnel label". The tunnel
+ label gets pushed on the MPLS label stack, and the packet is
+ forwarded to the tunnel's next hop.
+
+
+
+
+
+Rosen & Rekhter Standards Track [Page 24]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ - Otherwise,
+
+ * The packet will have an "IGP Next Hop", which is the next hop
+ along the IGP route to the BGP Next Hop.
+
+ * If the BGP Next Hop and the IGP Next Hop are the same, and if
+ penultimate hop popping is used, the packet is then sent to
+ the IGP Next Hop, carrying only the VPN route label.
+
+ * Otherwise, the IGP Next Hop will have assigned a label for
+ the route that best matches the address of the BGP Next Hop.
+ Call this the "tunnel label". The tunnel label gets pushed
+ on as the packet's top label. The packet is then forwarded
+ to the IGP Next Hop.
+
+ - MPLS will then carry the packet across the backbone to the BGP
+ Next Hop, where the VPN label will be examined.
+
+ If the backbone does not support MPLS, the MPLS packet carrying only
+ the VPN route label may be tunneled to the BGP Next Hop using the
+ techniques of [MPLS-in-IP-GRE]. When the packet emerges from the
+ tunnel, it will be at the BGP Next Hop, where the VPN route label
+ will be examined.
+
+ At the BGP Next Hop, the treatment of the packet depends on the VPN
+ route label (see Section 4.3.2). In many cases, the PE will be able
+ to determine, from this label, the attachment circuit over which the
+ packet should be transmitted (to a CE device), as well as the proper
+ data link layer header for that interface. In other cases, the PE
+ may only be able to determine that the packet's destination address
+ needs to be looked up in a particular VRF before being forwarded to a
+ CE device. There are also intermediate cases in which the VPN route
+ label may determine the packet's egress attachment circuit, but a
+ lookup (e.g., ARP) still needs to be done in order to determine the
+ packet's data link header on that attachment circuit.
+
+ Information in the MPLS header itself, and/or information associated
+ with the label, may also be used to provide QoS on the interface to
+ the CE.
+
+ In any event, if the packet was an unlabeled IP packet when it
+ arrived at its ingress PE, it will again be an unlabeled packet when
+ it leaves its egress PE.
+
+ The fact that packets with VPN route labels are tunneled through the
+ backbone is what makes it possible to keep all the VPN routes out of
+ the P routers. This is crucial to ensuring the scalability of the
+
+
+
+
+Rosen & Rekhter Standards Track [Page 25]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ scheme. The backbone does not even need to have routes to the CEs,
+ only to the PEs.
+
+ With respect to the tunnels, it is worth noting that this
+ specification:
+
+ - DOES NOT require that the tunnels be point-to-point; multipoint-
+ to-point can be used;
+
+ - DOES NOT require that there be any explicit setup of the tunnels,
+ either via signaling or via manual configuration;
+
+ - DOES NOT require that there be any tunnel-specific signaling;
+
+ - DOES NOT require that there be any tunnel-specific state in the P
+ or PE routers, beyond what is necessary to maintain the routing
+ information and (if used) the MPLS label information.
+
+ Of course, this specification is compatible with the use of point-
+ to-point tunnels that must be explicitly configured and/or signaled,
+ and in some situations there may be reasons for using such tunnels.
+
+ The considerations that are relevant to choosing a particular
+ tunneling technology are outside the scope of this specification.
+
+6. Maintaining Proper Isolation of VPNs
+
+ To maintain proper isolation of one VPN from another, it is important
+ that no router in the backbone accept a tunneled packet from outside
+ the backbone, unless it is sure that both endpoints of that tunnel
+ are outside the backbone.
+
+ If MPLS is being used as the tunneling technology, this means that a
+ router in the backbone MUST NOT accept a labeled packet from any
+ adjacent non-backbone device unless the following two conditions
+ hold:
+
+ 1. the label at the top of the label stack was actually
+ distributed by that backbone router to that non-backbone
+ device, and
+
+ 2. the backbone router can determine that use of that label will
+ cause the packet to leave the backbone before any labels lower
+ in the stack will be inspected, and before the IP header will
+ be inspected.
+
+ The first condition ensures that any labeled packets received from
+ non-backbone routers have a legitimate and properly assigned label at
+
+
+
+Rosen & Rekhter Standards Track [Page 26]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ the top of the label stack. The second condition ensures that the
+ backbone routers will never look below that top label. Of course,
+ the simplest way to meet these two conditions is just to have the
+ backbone devices refuse to accept labeled packets from non-backbone
+ devices.
+
+ If MPLS is not being used as the tunneling technology, then filtering
+ must be done to ensure that an MPLS-in-IP or MPLS-in-GRE packet can
+ be accepted into the backbone only if the packet's IP destination
+ address will cause it to be sent outside the backbone.
+
+7. How PEs Learn Routes from CEs
+
+ The PE routers that attach to a particular VPN need to know, for each
+ attachment circuit leading to that VPN, which of the VPN's addresses
+ should be reached over that attachment circuit.
+
+ The PE translates these addresses into VPN-IPv4 addresses, using a
+ configured RD. The PE then treats these VPN-IPv4 routes as input to
+ BGP. Routes from a VPN site are NOT leaked into the backbone's IGP.
+
+ Exactly which PE/CE route distribution techniques are possible
+ depends on whether or not a particular CE is in a "transit VPN". A
+ "transit VPN" is one that contains a router that receives routes from
+ a "third party" (i.e., from a router that is not in the VPN, but is
+ not a PE router) and that redistributes those routes to a PE router.
+ A VPN that is not a transit VPN is a "stub VPN". The vast majority
+ of VPNs, including just about all corporate enterprise networks,
+ would be expected to be "stubs" in this sense.
+
+ The possible PE/CE distribution techniques are:
+
+ 1. Static routing (i.e., configuration) may be used. (This is
+ likely to be useful only in stub VPNs.)
+
+ 2. PE and CE routers may be Routing Information Protocol (RIP)
+ [RIP] peers, and the CE may use RIP to tell the PE router the
+ set of address prefixes that are reachable at the CE router's
+ site. When RIP is configured in the CE, care must be taken to
+ ensure that address prefixes from other sites (i.e., address
+ prefixes learned by the CE router from the PE router) are never
+ advertised to the PE. More precisely: if a PE router, say,
+ PE1, receives a VPN-IPv4 route R1, and as a result distributes
+ an IPv4 route R2 to a CE, then R2 must not be distributed back
+ from that CE's site to a PE router, say, PE2, (where PE1 and
+ PE2 may be the same router or different routers), unless PE2
+ maps R2 to a VPN-IPv4 route that is different than (i.e.,
+ contains a different RD than) R1.
+
+
+
+Rosen & Rekhter Standards Track [Page 27]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ 3. The PE and CE routers may be OSPF peers. A PE router that is
+ an OSPF peer of a CE router appears, to the CE router, to be an
+ area 0 router. If a PE router is an OSPF peer of CE routers
+ that are in distinct VPNs, the PE must of course be running
+ multiple instances of OSPF.
+
+ IPv4 routes that the PE learns from the CE via OSPF are
+ redistributed into BGP as VPN-IPv4 routes. Extended Community
+ attributes are used to carry, along with the route, all the
+ information needed to enable the route to be distributed to
+ other CE routers in the VPN in the proper type of OSPF Link
+ State Advertisement (LSA). OSPF route tagging is used to
+ ensure that routes received from the MPLS/BGP backbone are not
+ sent back into the backbone.
+
+ Specification of the complete set of procedures for the use of
+ OSPF between PE and CE can be found in [VPN-OSPF] and
+ [OSPF-2547-DNBIT].
+
+ 4. The PE and CE routers may be BGP peers, and the CE router may
+ use BGP (in particular, EBGP) to tell the PE router the set of
+ address prefixes that are at the CE router's site. (This
+ technique can be used in stub VPNs or transit VPNs.)
+
+ This technique has a number of advantages over the others:
+
+ a) Unlike the IGP alternatives, this does not require the PE
+ to run multiple routing algorithm instances in order to
+ talk to multiple CEs.
+
+ b) BGP is explicitly designed for just this function:
+ passing routing information between systems run by
+ different administrations.
+
+ c) If the site contains "BGP backdoors", i.e., routers with
+ BGP connections to routers other than PE routers, this
+ procedure will work correctly in all circumstances. The
+ other procedures may or may not work, depending on the
+ precise circumstances.
+
+ d) Use of BGP makes it easy for the CE to pass attributes of
+ the routes to the PE. A complete specification of the
+ set of attributes and their use is outside the scope of
+ this document. However, some examples of the way this
+ may be used are the following:
+
+
+
+
+
+
+Rosen & Rekhter Standards Track [Page 28]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ - The CE may suggest a particular Route Target for each
+ route, from among the Route Targets that the PE is
+ authorized to attach to the route. The PE would then
+ attach only the suggested Route Target, rather than
+ the full set. This gives the CE administrator some
+ dynamic control of the distribution of routes from
+ the CE.
+
+ - Additional types of Extended Community attributes may
+ be defined, where the intention is to have those
+ attributes passed transparently (i.e., without being
+ changed by the PE routers) from CE to CE. This would
+ allow CE administrators to implement additional route
+ filtering, beyond that which is done by the PEs.
+ This additional filtering would not require
+ coordination with the SP.
+
+ On the other hand, using BGP may be something new for the CE
+ administrators.
+
+ If a site is not in a transit VPN, note that it need not have a
+ unique Autonomous System Number (ASN). Every CE whose site is
+ not in a transit VPN can use the same ASN. This can be chosen
+ from the private ASN space, and it will be stripped out by the
+ PE. Routing loops are prevented by use of the Site of Origin
+ attribute (see below).
+
+ What if a set of sites constitutes a transit VPN? This will
+ generally be the case only if the VPN is itself an Internet
+ Service Provider's (ISP's) network, where the ISP is itself
+ buying backbone services from another SP. The latter SP may be
+ called a "carrier's carrier". In this case, the best way to
+ provide the VPN is to have the CE routers support MPLS, and to
+ use the technique described in Section 9.
+
+ When we do not need to distinguish among the different ways in which
+ a PE can be informed of the address prefixes that exist at a given
+ site, we will simply say that the PE has "learned" the routes from
+ that site. This includes the case where the PE has been manually
+ configured with the routes.
+
+ Before a PE can redistribute a VPN-IPv4 route learned from a site, it
+ must assign a Route Target attribute (see Section 4.3.1) to the
+ route, and it may assign a Site of Origin attribute to the route.
+
+ The Site of Origin attribute, if used, is encoded as a Route Origin
+ Extended Community [BGP-EXTCOMM]. The purpose of this attribute is
+ to uniquely identify the set of routes learned from a particular
+
+
+
+Rosen & Rekhter Standards Track [Page 29]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ site. This attribute is needed in some cases to ensure that a route
+ learned from a particular site via a particular PE/CE connection is
+ not distributed back to the site through a different PE/CE
+ connection. It is particularly useful if BGP is being used as the
+ PE/CE protocol, but different sites have not been assigned distinct
+ ASNs.
+
+8. How CEs Learn Routes from PEs
+
+ In this section, we assume that the CE device is a router.
+
+ If the PE places a particular route in the VRF it uses to route
+ packets received from a particular CE, then in general, the PE may
+ distribute that route to the CE. Of course, the PE may distribute
+ that route to the CE only if this is permitted by the rules of the
+ PE/CE protocol. (For example, if a particular PE/CE protocol has
+ "split horizon", certain routes in the VRF cannot be redistributed
+ back to the CE.) We add one more restriction on the distribution of
+ routes from PE to CE: if a route's Site of Origin attribute
+ identifies a particular site, that route must never be redistributed
+ to any CE at that site.
+
+ In most cases, however, it will be sufficient for the PE to simply
+ distribute the default route to the CE. (In some cases, it may even
+ be sufficient for the CE to be configured with a default route
+ pointing to the PE.) This will generally work at any site that does
+ not itself need to distribute the default route to other sites.
+ (E.g., if one site in a corporate VPN has the corporation's access to
+ the Internet, that site might need to have default distributed to the
+ other sites, but one could not distribute default to that site
+ itself.)
+
+ Whatever procedure is used to distribute routes from CE to PE will
+ also be used to distribute routes from PE to CE.
+
+9. Carriers' Carriers
+
+ Sometimes a VPN may actually be the network of an ISP, with its own
+ peering and routing policies. Sometimes a VPN may be the network of
+ an SP that is offering VPN services in turn to its own customers.
+ VPNs like these can also obtain backbone service from another SP, the
+ "carrier's carrier", using essentially the same methods described in
+ this document. However, it is necessary in these cases that the CE
+ routers support MPLS. In particular:
+
+ - The CE routers should distribute to the PE routers ONLY those
+ routes that are internal to the VPN. This allows the VPN to be
+ handled as a stub VPN.
+
+
+
+Rosen & Rekhter Standards Track [Page 30]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ - The CE routers should support MPLS, in that they should be able
+ to receive labels from the PE routers, and send labeled packets
+ to the PE routers. They do not need to distribute labels of
+ their own, though.
+
+ - The PE routers should distribute, to the CE routers, labels for
+ the routes they distribute to the CE routers.
+
+ The PE must not distribute the same label to two different CEs
+ unless one of the following conditions holds:
+
+ * The two CEs are associated with exactly the same set of VRFs;
+
+ * The PE maintains a different Incoming Label Map ([MPLS-ARCH])
+ for each CE.
+
+ Further, when the PE receives a labeled packet from a CE, it must
+ verify that the top label is one that was distributed to that CE.
+
+ - Routers at the different sites should establish BGP connections
+ among themselves for the purpose of exchanging external routes
+ (i.e., routes that lead outside of the VPN).
+
+ - All the external routes must be known to the CE routers.
+
+ Then when a CE router looks up a packet's destination address, the
+ routing lookup will resolve to an internal address, usually the
+ address of the packet's BGP next hop. The CE labels the packet
+ appropriately and sends the packet to the PE. The PE, rather than
+ looking up the packet's IP destination address in a VRF, uses the
+ packet's top MPLS label to select the BGP next hop. As a result, if
+ the BGP next hop is more than one hop away, the top label will be
+ replaced by two labels, a tunnel label and a VPN route label. If the
+ BGP next hop is one hop away, the top label may be replaced by just
+ the VPN route label. If the ingress PE is also the egress PE, the
+ top label will just be popped. When the packet is sent from its
+ egress PE to a CE, the packet will have one fewer MPLS label than it
+ had when it was first received by its ingress PE.
+
+ In the above procedure, the CE routers are the only routers in the
+ VPN that need to support MPLS. If, on the other hand, all the
+ routers at a particular VPN site support MPLS, then it is no longer
+ required that the CE routers know all the external routes. All that
+ is required is that the external routes be known to whatever routers
+ are responsible for putting the label stack on a hitherto unlabeled
+ packet and that there be a label switched path that leads from those
+ routers to their BGP peers at other sites. In this case, for each
+
+
+
+
+Rosen & Rekhter Standards Track [Page 31]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ internal route that a CE router distributes to a PE router, it must
+ also distribute a label.
+
+10. Multi-AS Backbones
+
+ What if two sites of a VPN are connected to different Autonomous
+ Systems (e.g., because the sites are connected to different SPs)?
+ The PE routers attached to that VPN will then not be able to maintain
+ IBGP connections with each other, or with a common route reflector.
+ Rather, there needs to be some way to use EBGP to distribute VPN-IPv4
+ addresses.
+
+ There are a number of different ways of handling this case, which we
+ present in order of increasing scalability.
+
+ a) VRF-to-VRF connections at the AS (Autonomous System) border
+ routers.
+
+ In this procedure, a PE router in one AS attaches directly to a
+ PE router in another. The two PE routers will be attached by
+ multiple sub-interfaces, at least one for each of the VPNs
+ whose routes need to be passed from AS to AS. Each PE will
+ treat the other as if it were a CE router. That is, the PEs
+ associate each such sub-interface with a VRF, and use EBGP to
+ distribute unlabeled IPv4 addresses to each other.
+
+ This is a procedure that "just works", and that does not
+ require MPLS at the border between ASes. However, it does not
+ scale as well as the other procedures discussed below.
+
+ b) EBGP redistribution of labeled VPN-IPv4 routes from AS to
+ neighboring AS.
+
+ In this procedure, the PE routers use IBGP to redistribute
+ labeled VPN-IPv4 routes either to an Autonomous System Border
+ Router (ASBR), or to a route reflector of which an ASBR is a
+ client. The ASBR then uses EBGP to redistribute those labeled
+ VPN-IPv4 routes to an ASBR in another AS, which in turn
+ distributes them to the PE routers in that AS, or perhaps to
+ another ASBR which in turn distributes them, and so on.
+
+ When using this procedure, VPN-IPv4 routes should only be
+ accepted on EBGP connections at private peering points, as part
+ of a trusted arrangement between SPs. VPN-IPv4 routes should
+ neither be distributed to nor accepted from the public
+ Internet, or from any BGP peers that are not trusted. An ASBR
+ should never accept a labeled packet from an EBGP peer unless
+ it has actually distributed the top label to that peer.
+
+
+
+Rosen & Rekhter Standards Track [Page 32]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ If there are many VPNs having sites attached to different
+ Autonomous Systems, there does not need to be a single ASBR
+ between those two ASes that holds all the routes for all the
+ VPNs; there can be multiple ASBRs, each of which holds only the
+ routes for a particular subset of the VPNs.
+
+ This procedure requires that there be a label switched path
+ leading from a packet's ingress PE to its egress PE. Hence the
+ appropriate trust relationships must exist between and among
+ the set of ASes along the path. Also, there must be agreement
+ among the set of SPs as to which border routers need to receive
+ routes with which Route Targets.
+
+ c) Multi-hop EBGP redistribution of labeled VPN-IPv4 routes
+ between source and destination ASes, with EBGP redistribution
+ of labeled IPv4 routes from AS to neighboring AS.
+
+ In this procedure, VPN-IPv4 routes are neither maintained nor
+ distributed by the ASBRs. An ASBR must maintain labeled IPv4
+ /32 routes to the PE routers within its AS. It uses EBGP to
+ distribute these routes to other ASes. ASBRs in any transit
+ ASes will also have to use EBGP to pass along the labeled /32
+ routes. This results in the creation of a label switched path
+ from the ingress PE router to the egress PE router. Now PE
+ routers in different ASes can establish multi-hop EBGP
+ connections to each other, and can exchange VPN-IPv4 routes
+ over those connections.
+
+ If the /32 routes for the PE routers are made known to the P
+ routers of each AS, everything works normally. If the /32
+ routes for the PE routers are NOT made known to the P routers
+ (other than the ASBRs), then this procedure requires a packet's
+ ingress PE to put a three-label stack on it. The bottom label
+ is assigned by the egress PE, corresponding to the packet's
+ destination address in a particular VRF. The middle label is
+ assigned by the ASBR, corresponding to the /32 route to the
+ egress PE. The top label is assigned by the ingress PE's IGP
+ Next Hop, corresponding to the /32 route to the ASBR.
+
+ To improve scalability, one can have the multi-hop EBGP
+ connections exist only between a route reflector in one AS and
+ a route reflector in another. (However, when the route
+ reflectors distribute routes over this connection, they do not
+ modify the BGP next hop attribute of the routes.) The actual
+ PE routers would then only have IBGP connections to the route
+ reflectors in their own AS.
+
+
+
+
+
+Rosen & Rekhter Standards Track [Page 33]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ This procedure is very similar to the "carrier's carrier"
+ procedures described in Section 9. Like the previous
+ procedure, it requires that there be a label switched path
+ leading from a packet's ingress PE to its egress PE.
+
+11. Accessing the Internet from a VPN
+
+ Many VPN sites will need to be able to access the public Internet, as
+ well as to access other VPN sites. The following describes some of
+ the alternative ways of doing this.
+
+ 1. In some VPNs, one or more of the sites will obtain Internet
+ access by means of an "Internet gateway" (perhaps a firewall)
+ attached to a non-VRF interface to an ISP. The ISP may or may
+ not be the same organization as the SP that is providing the
+ VPN service. Traffic to/from the Internet gateway would then
+ be routed according to the PE router's default forwarding
+ table.
+
+ In this case, the sites that have Internet access may be
+ distributing a default route to their PEs, which in turn
+ redistribute it to other PEs and hence into other sites of the
+ VPN. This provides Internet access for all of the VPN's sites.
+
+ In order to properly handle traffic from the Internet, the ISP
+ must distribute, to the Internet, routes leading to addresses
+ that are within the VPN. This is completely independent of any
+ of the route distribution procedures described in this
+ document. The internal structure of the VPN will in general
+ not be visible from the Internet; such routes would simply lead
+ to the non-VRF interface that attaches to the VPN's Internet
+ gateway.
+
+ In this model, there is no exchange of routes between a PE
+ router's default forwarding table and any of its VRFs. VPN
+ route distribution procedures and Internet route distribution
+ procedures are completely independent.
+
+ Note that although some sites of the VPN use a VRF interface to
+ communicate with the Internet, ultimately all packets to/from
+ the Internet traverse a non-VRF interface before
+ leaving/entering the VPN, so we refer to this as "non-VRF
+ Internet access".
+
+ Note that the PE router to which the non-VRF interface attaches
+ does not necessarily need to maintain all the Internet routes
+ in its default forwarding table. The default forwarding table
+ could have as few as one route, "default", which leads to
+
+
+
+Rosen & Rekhter Standards Track [Page 34]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ another router (probably an adjacent one) that has the Internet
+ routes. A variation of this scheme is to tunnel packets
+ received over the non-VRF interface from the PE router to
+ another router, where this other router maintains the full set
+ of Internet routes.
+
+ 2. Some VPNs may obtain Internet access via a VRF interface ("VRF
+ Internet access"). If a packet is received by a PE over a VRF
+ interface, and if the packet's destination address does not
+ match any route in the VRF, then it may be matched against the
+ PE's default forwarding table. If a match is made there, the
+ packet can be forwarded natively through the backbone to the
+ Internet, instead of being forwarded by MPLS.
+
+ In order for traffic to flow natively in the opposite direction
+ (from Internet to VRF interface), some of the routes from the
+ VRF must be exported to the Internet forwarding table.
+ Needless to say, any such routes must correspond to globally
+ unique addresses.
+
+ In this scheme, the default forwarding table might have the
+ full set of Internet routes, or it might have as little as a
+ single default route leading to another router that does have
+ the full set of Internet routes in its default forwarding
+ table.
+
+ 3. Suppose the PE has the capability to store "non-VPN routes" in
+ a VRF. If a packet's destination address matches a "non-VPN
+ route", then the packet is transmitted natively, rather than
+ being transmitted via MPLS. If the VRF contains a non-VPN
+ default route, all packets for the public Internet will match
+ it, and be forwarded natively to the default route's next hop.
+ At that next hop, the packets' destination addresses will be
+ looked up in the default forwarding table, and may match more
+ specific routes.
+
+ This technique would only be available if none of the CE
+ routers is distributing a default route.
+
+ 4. It is also possible to obtain Internet access via a VRF
+ interface by having the VRF contain the Internet routes.
+ Compared with model 2, this eliminates the second lookup, but
+ it has the disadvantage of requiring the Internet routes to be
+ replicated in each such VRF.
+
+ If this technique is used, the SP may want to make its
+ interface to the Internet be a VRF interface, and to use the
+
+
+
+
+Rosen & Rekhter Standards Track [Page 35]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ techniques of Section 4 to distribute Internet routes, as VPN-
+ IPv4 routes, to other VRFs.
+
+ It should be clearly understood that by default, there is no exchange
+ of routes between a VRF and the default forwarding table. This is
+ done ONLY upon agreement between a customer and an SP, and only if it
+ suits the customer's policies.
+
+12. Management VPNs
+
+ This specification does not require that the sub-interface connecting
+ a PE router and a CE router be a "numbered" interface. If it is a
+ numbered interface, this specification allows the addresses assigned
+ to the interface to come from either the address space of the VPN or
+ the address space of the SP.
+
+ If a CE router is being managed by the Service Provider, then the
+ Service Provider will likely have a network management system that
+ needs to be able to communicate with the CE router. In this case,
+ the addresses assigned to the sub-interface connecting the CE and PE
+ routers should come from the SP's address space, and should be unique
+ within that space. The network management system should itself
+ connect to a PE router (more precisely, be at a site that connects to
+ a PE router) via a VRF interface. The address of the network
+ management system will be exported to all VRFs that are associated
+ with interfaces to CE routers that are managed by the SP. The
+ addresses of the CE routers will be exported to the VRF associated
+ with the network management system, but not to any other VRFs.
+
+ This allows communication between the CE and network management
+ system, but does not allow any undesired communication to or among
+ the CE routers.
+
+ One way to ensure that the proper route import/exports are done is to
+ use two Route Targets; call them T1 and T2. If a particular VRF
+ interface attaches to a CE router that is managed by the SP, then
+ that VRF is configured to:
+
+ - import routes that have T1 attached to them, and
+
+ - attach T2 to addresses assigned to each end of its VRF
+ interfaces.
+
+ If a particular VRF interface attaches to the SP's network management
+ system, then that VRF is configured to attach T1 to the address of
+ that system, and to import routes that have T2 attached to them.
+
+
+
+
+
+Rosen & Rekhter Standards Track [Page 36]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+13. Security Considerations
+
+13.1. Data Plane
+
+ By security in the "data plane", we mean protection against the
+ following possibilities:
+
+ - Packets from within a VPN travel to a site outside the VPN, other
+ than in a manner consistent with the policies of the VPN.
+
+ - Packets from outside a VPN enter one of the VPN's sites, other
+ than in a manner consistent with the policies of the VPN.
+
+ Under the following conditions:
+
+ 1. a backbone router does not accept labeled packets over a
+ particular data link, unless it is known that that data link
+ attaches only to trusted systems, or unless it is known that
+ such packets will leave the backbone before the IP header or
+ any labels lower in the stack will be inspected, and
+
+ 2. labeled VPN-IPv4 routes are not accepted from untrusted or
+ unreliable routing peers,
+
+ 3. no successful attacks have been mounted on the control plane,
+
+ the data plane security provided by this architecture is virtually
+ identical to that provided to VPNs by Frame Relay or ATM backbones.
+ If the devices under the control of the SP are properly configured,
+ data will not enter or leave a VPN unless authorized to do so.
+
+ Condition 1 above can be stated more precisely. One should discard a
+ labeled packet received from a particular neighbor unless one of the
+ following two conditions holds:
+
+ - the packet's top label has a label value that the receiving
+ system has distributed to that neighbor, or
+
+ - the packet's top label has a label value that the receiving
+ system has distributed to a system beyond that neighbor (i.e.,
+ when it is known that the path from the system to which the label
+ was distributed to the receiving system may be via that
+ neighbor).
+
+
+
+
+
+
+
+
+Rosen & Rekhter Standards Track [Page 37]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ Condition 2 above is of most interest in the case of inter-provider
+ VPNs (see Section 10). For inter-provider VPNs constructed according
+ to scheme b) of Section 10, condition 2 is easily checked. (The
+ issue of security when scheme (c) of Section 10 is used is for
+ further study.)
+
+ It is worth noting that the use of MPLS makes it much simpler to
+ provide data plane security than might be possible if one attempted
+ to use some form of IP tunneling in place of the MPLS outer label.
+ It is a simple matter to have one's border routers refuse to accept a
+ labeled packet unless the first of the above conditions applies to
+ it. It is rather more difficult to configure a router to refuse to
+ accept an IP packet if that packet is an IP tunneled packet whose
+ destination address is that of a PE router; certainly, this is not
+ impossible to do, but it has both management and performance
+ implications.
+
+ MPLS-in-IP and MPLS-in-GRE tunneling are specified in
+ [MPLS-in-IP-GRE]. If it is desired to use such tunnels to carry VPN
+ packets, then the security considerations described in Section 8 of
+ that document must be fully understood. Any implementation of
+ BGP/MPLS IP VPNs that allows VPN packets to be tunneled as described
+ in that document MUST contain an implementation of IPsec that can be
+ used as therein described. If the tunnel is not secured by IPsec,
+ then the technique of IP address filtering at the border routers,
+ described in Section 8.2 of that document, is the only means of
+ ensuring that a packet that exits the tunnel at a particular egress
+ PE was actually placed in the tunnel by the proper tunnel head node
+ (i.e., that the packet does not have a spoofed source address).
+ Since border routers frequently filter only source addresses, packet
+ filtering may not be effective unless the egress PE can check the IP
+ source address of any tunneled packet it receives, and compare it to
+ a list of IP addresses that are valid tunnel head addresses. Any
+ implementation that allows MPLS-in-IP and/or MPLS-in-GRE tunneling to
+ be used without IPsec MUST allow the egress PE to validate in this
+ manner the IP source address of any tunneled packet that it receives.
+
+ In the case where a number of CE routers attach to a PE router via a
+ LAN interface, to ensure proper security, one of the following
+ conditions must hold:
+
+ 1. All the CE routers on the LAN belong to the same VPN, or
+
+ 2. A trusted and secured LAN switch divides the LAN into multiple
+ VLANs, with each VLAN containing only systems of a single VPN;
+ in this case, the switch will attach the appropriate VLAN tag
+ to any packet before forwarding it to the PE router.
+
+
+
+
+Rosen & Rekhter Standards Track [Page 38]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ Cryptographic privacy is not provided by this architecture, nor by
+ Frame Relay or ATM VPNs. These architectures are all compatible with
+ the use of cryptography on a CE-CE basis, if that is desired.
+
+ The use of cryptography on a PE-PE basis is for further study.
+
+13.2. Control Plane
+
+ The data plane security of the previous section depends on the
+ security of the control plane. To ensure security, neither BGP nor
+ LDP connections should be made with untrusted peers. The TCP/IP MD5
+ authentication option [TCP-MD5] should be used with both these
+ protocols. The routing protocol within the SP's network should also
+ be secured in a similar manner.
+
+13.3. Security of P and PE Devices
+
+ If the physical security of these devices is compromised, data plane
+ security may also be compromised.
+
+ The usual steps should be taken to ensure that IP traffic from the
+ public Internet cannot be used to modify the configuration of these
+ devices, or to mount Denial of Service attacks on them.
+
+14. Quality of Service
+
+ Although not the focus of this paper, Quality of Service is a key
+ component of any VPN service. In MPLS/BGP VPNs, existing L3 QoS
+ capabilities can be applied to labeled packets through the use of the
+ "experimental" bits in the shim header [MPLS-ENCAPS], or, where ATM
+ is used as the backbone, through the use of ATM QoS capabilities.
+ The traffic engineering work discussed in [MPLS-RSVP] is also
+ directly applicable to MPLS/BGP VPNs. Traffic engineering could even
+ be used to establish label switched paths with particular QoS
+ characteristics between particular pairs of sites, if that is
+ desirable. Where an MPLS/BGP VPN spans multiple SPs, the
+ architecture described in [PASTE] may be useful. An SP may apply
+ either intserv (Integrated Services) or diffserv (Differentiated
+ Services) capabilities to a particular VPN, as appropriate.
+
+
+
+
+
+
+
+
+
+
+
+
+Rosen & Rekhter Standards Track [Page 39]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+15. Scalability
+
+ We have discussed scalability issues throughout this paper. In this
+ section, we briefly summarize the main characteristics of our model
+ with respect to scalability.
+
+ The Service Provider backbone network consists of (a) PE routers, (b)
+ BGP Route Reflectors, (c) P routers (that are neither PE routers nor
+ Route Reflectors), and, in the case of multi-provider VPNs, (d)
+ ASBRs.
+
+ P routers do not maintain any VPN routes. In order to properly
+ forward VPN traffic, the P routers need only maintain routes to the
+ PE routers and the ASBRs. The use of two levels of labeling is what
+ makes it possible to keep the VPN routes out of the P routers.
+
+ A PE router maintains VPN routes, but only for those VPNs to which it
+ is directly attached.
+
+ Route reflectors can be partitioned among VPNs so that each partition
+ carries routes for only a subset of the VPNs supported by the Service
+ Provider. Thus, no single route reflector is required to maintain
+ routes for all VPNs.
+
+ For inter-provider VPNs, if the ASBRs maintain and distribute VPN-
+ IPv4 routes, then the ASBRs can be partitioned among VPNs in a
+ similar manner, with the result that no single ASBR is required to
+ maintain routes for all the inter-provider VPNs. If multi-hop EBGP
+ is used, then the ASBRs need not maintain and distribute VPN-IPv4
+ routes at all.
+
+ As a result, no single component within the Service Provider network
+ has to maintain all the routes for all the VPNs. So the total
+ capacity of the network to support increasing numbers of VPNs is not
+ limited by the capacity of any individual component.
+
+16. IANA Considerations
+
+ The Internet Assigned Numbers Authority (IANA) has created a new
+ registry for the "Route Distinguisher Type Field" (see Section 4.2).
+ This is a two-byte field. Types 0, 1, and 2 are defined by this
+ document. Additional Route Distinguisher Type Field values with a
+ high-order bit of 0 may be allocated by IANA on a "First Come, First
+ Served" basis [IANA]. Values with a high-order bit of 1 may be
+ allocated by IANA based on "IETF consensus" [IANA].
+
+
+
+
+
+
+Rosen & Rekhter Standards Track [Page 40]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ This document specifies (see Section 4.3.4) the use of the BGP
+ Address Family Identifier (AFI) value 1, along with the BGP
+ Subsequent Address Family Identifier (SAFI) value 128, to represent
+ the address family "VPN-IPv4 Labeled Addresses", which is defined in
+ this document.
+
+ The use of AFI value 1 for IP is as currently specified in the IANA
+ registry "Address Family Identifier", so IANA need take no action
+ with respect to it.
+
+ The SAFI value 128 was originally specified as "Private Use" in the
+ IANA "Subsequent Address Family Identifier" registry. IANA has
+ changed the SAFI value 128 from "private use" to "MPLS-labeled VPN
+ address".
+
+17. Acknowledgements
+
+ The full list of contributors can be found in Section 18.
+
+ Significant contributions to this work have also been made by Ravi
+ Chandra, Dan Tappan, and Bob Thomas.
+
+ We also wish to thank Shantam Biswas for his review and
+ contributions.
+
+18. Contributors
+
+ Tony Bogovic
+ Telcordia Technologies
+ 445 South Street, Room 1A264B
+ Morristown, NJ 07960
+
+ EMail: tjb@research.telcordia.com
+
+
+ Stephen John Brannon
+ Swisscom AG
+ Postfach 1570
+ CH-8301
+ Glattzentrum (Zuerich), Switzerland
+
+ EMail: stephen.brannon@swisscom.com
+
+
+
+
+
+
+
+
+
+Rosen & Rekhter Standards Track [Page 41]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ Marco Carugi
+ Nortel Networks S.A.
+ Parc d'activites de Magny-Les Jeunes Bois CHATEAUFORT
+ 78928 YVELINES Cedex 9 - FRANCE
+
+ EMail: marco.carugi@nortelnetworks.com
+
+
+ Christopher J. Chase
+ AT&T
+ 200 Laurel Ave
+ Middletown, NJ 07748
+ USA
+
+ EMail: chase@att.com
+
+
+ Ting Wo Chung
+ Bell Nexxia
+ 181 Bay Street
+ Suite 350
+ Toronto, Ontario
+ M5J2T3
+
+ EMail: ting_wo.chung@bellnexxia.com
+
+
+ Eric Dean
+
+
+ Jeremy De Clercq
+ Alcatel Network Strategy Group
+ Francis Wellesplein 1
+ 2018 Antwerp, Belgium
+
+ EMail: jeremy.de_clercq@alcatel.be
+
+
+ Luyuan Fang
+ AT&T
+ IP Backbone Architecture
+ 200 Laurel Ave.
+ Middletown, NJ 07748
+
+ EMail: luyuanfang@att.com
+
+
+
+
+
+
+Rosen & Rekhter Standards Track [Page 42]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ Paul Hitchen
+ BT
+ BT Adastral Park
+ Martlesham Heath,
+ Ipswich IP5 3RE
+ UK
+
+ EMail: paul.hitchen@bt.com
+
+
+ Manoj Leelanivas
+ Juniper Networks, Inc.
+ 385 Ravendale Drive
+ Mountain View, CA 94043 USA
+
+ EMail: manoj@juniper.net
+
+
+ Dave Marshall
+ Worldcom
+ 901 International Parkway
+ Richardson, Texas 75081
+
+ EMail: dave.marshall@wcom.com
+
+
+ Luca Martini
+ Cisco Systems, Inc.
+ 9155 East Nichols Avenue, Suite 400
+ Englewood, CO, 80112
+
+ EMail: lmartini@cisco.com
+
+
+ Monique Jeanne Morrow
+ Cisco Systems, Inc.
+ Glatt-com, 2nd floor
+ CH-8301
+ Glattzentrum, Switzerland
+
+ EMail: mmorrow@cisco.com
+
+
+
+
+
+
+
+
+
+
+Rosen & Rekhter Standards Track [Page 43]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ Ravichander Vaidyanathan
+ Telcordia Technologies
+ 445 South Street, Room 1C258B
+ Morristown, NJ 07960
+
+ EMail: vravi@research.telcordia.com
+
+
+ Adrian Smith
+ BT
+ BT Adastral Park
+ Martlesham Heath,
+ Ipswich IP5 3RE
+ UK
+
+ EMail: adrian.ca.smith@bt.com
+
+
+ Vijay Srinivasan
+ 1200 Bridge Parkway
+ Redwood City, CA 94065
+
+ EMail: vsriniva@cosinecom.com
+
+
+ Alain Vedrenne
+ Equant
+ Heraklion, 1041 route des Dolines, BP347
+ 06906 Sophia Antipolis, Cedex, France
+
+ EMail: Alain.Vedrenne@equant.com
+
+19. Normative References
+
+ [BGP] Rekhter, Y. and T. Li, "A Border Gateway Protocol 4
+ (BGP-4)", RFC 4271, January 2006.
+
+ [BGP-MP] Bates, T., Rekhter, Y., Chandra, R., and D. Katz,
+ "Multiprotocol Extensions for BGP-4", RFC 2858,
+ June 2000.
+
+ [BGP-EXTCOMM] Sangli, S., Tappan, D., and Y. Rekhter, "BGP
+ Extended Communities Attribute", RFC 4360, February
+ 2006.
+
+ [MPLS-ARCH] Rosen, E., Viswanathan, A., and R. Callon,
+ "Multiprotocol Label Switching Architecture", RFC
+ 3031, January 2001.
+
+
+
+Rosen & Rekhter Standards Track [Page 44]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ [MPLS-BGP] Rekhter, Y. and E. Rosen, "Carrying Label
+ Information in BGP-4", RFC 3107, May 2001.
+
+ [MPLS-ENCAPS] Rosen, E., Tappan, D., Fedorkow, G., Rekhter, Y.,
+ Farinacci, D., Li, T., and A. Conta, "MPLS Label
+ Stack Encoding", RFC 3032, January 2001.
+
+20. Informative References
+
+ [BGP-AS4] Vohra, Q. and E. Chen, "BGP Support for Four-Octet
+ AS Number Space", Work in Progress, March 2004.
+
+ [BGP-ORF] Chen, E. and Y. Rekhter, "Cooperative Route
+ Filtering Capability for BGP-4", Work in Progress,
+ March 2004.
+
+ [BGP-RFSH] Chen, E., "Route Refresh Capability for BGP-4", RFC
+ 2918, September 2000.
+
+ [BGP-RR] Bates, T., Chandra, R., and E. Chen, "BGP Route
+ Reflection - An Alternative to Full Mesh IBGP", RFC
+ 2796, April 2000.
+
+ [IANA] Narten, T. and H. Alvestrand, "Guidelines for
+ Writing an IANA Considerations Section in RFCs",
+ BCP 26, RFC 2434, October 1998.
+
+ [MPLS-ATM] Davie, B., Lawrence, J., McCloghrie, K., Rosen, E.,
+ Swallow, G., Rekhter, Y., and P. Doolan, "MPLS
+ using LDP and ATM VC Switching", RFC 3035, January
+ 2001.
+
+ [MPLS/BGP-IPsec] Rosen, E., De Clercq, J., Paridaens, O., T'Joens,
+ Y., and C. Sargor, "Architecture for the Use of
+ PE-PE IPsec Tunnels in BGP/MPLS IP VPNs", Work in
+ Progress, March 2004.
+
+ [MPLS-FR] Conta, A., Doolan, P., and A. Malis, "Use of Label
+ Switching on Frame Relay Networks Specification",
+ RFC 3034, January 2001.
+
+ [MPLS-in-IP-GRE] Worster, T., Rekhter, Y., and E. Rosen,
+ "Encapsulating MPLS in IP or Generic Routing
+ Encapsulation (GRE)", RFC 4023, March 2005.
+
+ [MPLS-LDP] Andersson, L., Doolan, P., Feldman, N., Fredette,
+ A., and B. Thomas, "LDP Specification", RFC 3036,
+ January 2001.
+
+
+
+Rosen & Rekhter Standards Track [Page 45]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+ [MPLS-RSVP] Awduche, D., Berger, L., Gan, D., Li, T.,
+ Srinivasan, V., and G. Swallow, "RSVP-TE:
+ Extensions to RSVP for LSP Tunnels", RFC 3209,
+ December 2001.
+
+ [OSPFv2] Moy, J., "OSPF Version 2", STD 54, RFC 2328, April
+ 1998.
+
+ [PASTE] Li, T. and Y. Rekhter, "A Provider Architecture for
+ Differentiated Services and Traffic Engineering
+ (PASTE)", RFC 2430, October 1998.
+
+ [RIP] Malkin, G., "RIP Version 2", STD 56, RFC 2453,
+ November 1998.
+
+ [OSPF-2547-DNBIT] Rosen, E., Psenak, P., and P. Pillay-Esnault,
+ "Using an LSA Options Bit to Prevent Looping in
+ BGP/MPLS IP VPNs", Work in Progress, March 2004.
+
+ [TCP-MD5] Heffernan, A., "Protection of BGP Sessions via the
+ TCP MD5 Signature Option", RFC 2385, August 1998.
+
+ [VPN-MCAST] Rosen, E., Cai, Y., and J. Wijsnands, "Multicast in
+ MPLS/BGP VPNs", Work in Progress, May 2004.
+
+ [VPN-OSPF] Rosen, E., Psenak, P., and P. Pillay-Esnault, "OSPF
+ as the PE/CE Protocol in BGP/MPLS VPNs", Work in
+ Progress, February 2004.
+
+Authors' Addresses
+
+ Eric C. Rosen
+ Cisco Systems, Inc.
+ 1414 Massachusetts Avenue
+ Boxborough, MA 01719
+
+ EMail: erosen@cisco.com
+
+
+ Yakov Rekhter
+ Juniper Networks
+ 1194 N. Mathilda Avenue
+ Sunnyvale, CA 94089
+
+ EMail: yakov@juniper.net
+
+
+
+
+
+
+Rosen & Rekhter Standards Track [Page 46]
+
+RFC 4364 BGP/MPLS IP VPNs February 2006
+
+
+Full Copyright Statement
+
+ Copyright (C) The Internet Society (2006).
+
+ This document is subject to the rights, licenses and restrictions
+ contained in BCP 78, and except as set forth therein, the authors
+ retain all their rights.
+
+ This document and the information contained herein are provided on an
+ "AS IS" basis and THE CONTRIBUTOR, THE ORGANIZATION HE/SHE REPRESENTS
+ OR IS SPONSORED BY (IF ANY), THE INTERNET SOCIETY AND THE INTERNET
+ ENGINEERING TASK FORCE DISCLAIM ALL WARRANTIES, EXPRESS OR IMPLIED,
+ INCLUDING BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE
+ INFORMATION HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED
+ WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE.
+
+Intellectual Property
+
+ The IETF takes no position regarding the validity or scope of any
+ Intellectual Property Rights or other rights that might be claimed to
+ pertain to the implementation or use of the technology described in
+ this document or the extent to which any license under such rights
+ might or might not be available; nor does it represent that it has
+ made any independent effort to identify any such rights. Information
+ on the procedures with respect to rights in RFC documents can be
+ found in BCP 78 and BCP 79.
+
+ Copies of IPR disclosures made to the IETF Secretariat and any
+ assurances of licenses to be made available, or the result of an
+ attempt made to obtain a general license or permission for the use of
+ such proprietary rights by implementers or users of this
+ specification can be obtained from the IETF on-line IPR repository at
+ http://www.ietf.org/ipr.
+
+ The IETF invites any interested party to bring to its attention any
+ copyrights, patents or patent applications, or other proprietary
+ rights that may cover technology that may be required to implement
+ this standard. Please address the information to the IETF at
+ ietf-ipr@ietf.org.
+
+Acknowledgement
+
+ Funding for the RFC Editor function is provided by the IETF
+ Administrative Support Activity (IASA).
+
+
+
+
+
+
+
+Rosen & Rekhter Standards Track [Page 47]
+
diff --git a/monitor/monitor_backend.go b/monitor/monitor_backend.go
@@ -0,0 +1,22 @@
+package monitor
+
+import (
+ //"log"
+ "github.com/gocql/gocql"
+)
+
+// CassandraContext bundles a gocql cluster configuration with the session
+// created from it.
+type CassandraContext struct {
+	cconfig *gocql.ClusterConfig // assigned by StartCassandra
+	session *gocql.Session       // assigned by StartCassandra; nil before it is called
+}
+
+// StartCassandra builds a cluster configuration for addrs, selects the
+// keyspace kspace and asks the configuration to create a session. Both the
+// configuration and the session are stored on c. The error reported by
+// CreateSession, if any, is returned to the caller.
+func (c *CassandraContext) StartCassandra(kspace string, addrs ...string) (err error) {
+	c.cconfig = gocql.NewCluster(addrs...)
+	c.cconfig.Keyspace = kspace
+	c.session, err = c.cconfig.CreateSession()
+	return err
+}
+
+// StopCassandra closes the session held by c. It does nothing when c.session
+// is nil (for example when StartCassandra was never called), so calling it on
+// a zero-value context no longer dereferences a nil pointer.
+func (c *CassandraContext) StopCassandra() {
+	if c.session == nil {
+		return
+	}
+	c.session.Close()
+}
diff --git a/monitor/monitor_backend_test.go b/monitor/monitor_backend_test.go
@@ -0,0 +1,32 @@
+package monitor
+
+import (
+ "testing"
+)
+
+// conerr receives the result of StartCassandra in the tests of this file.
+var conerr error
+
+// TestStartStop opens a session against the shared test Cassandra instance
+// and closes it again. When the instance cannot be reached the test is
+// skipped, so the run reports SKIP instead of a misleading PASS.
+func TestStartStop(t *testing.T) {
+	c := &CassandraContext{}
+	conerr = c.StartCassandra("bgp_mongol_test", "worf.netsec.colostate.edu")
+	if conerr != nil {
+		t.Skipf("could not connect to test cassandra instance at worf.netsec.colostate.edu: %v", conerr)
+	}
+	c.StopCassandra()
+}
+
+// TestCreateTable opens a session against the shared test Cassandra instance
+// and issues a CREATE TABLE statement for the bmf table. When the instance
+// cannot be reached the test is skipped, so the run reports SKIP instead of a
+// misleading PASS. The session is closed on every exit path once it has been
+// opened.
+func TestCreateTable(t *testing.T) {
+	c := &CassandraContext{}
+	conerr = c.StartCassandra("bgp_mongol_test", "worf.netsec.colostate.edu")
+	if conerr != nil {
+		t.Skipf("could not connect to test cassandra instance at worf.netsec.colostate.edu: %v", conerr)
+	}
+	defer c.StopCassandra()
+
+	// A failing statement is only logged, as in the original test.
+	// NOTE(review): presumably tolerated because the table may already exist
+	// on the shared instance; confirm before turning this into a failure.
+	if err := c.session.Query("CREATE TABLE bmf ( prefix text PRIMARY KEY, dat1 text, dat2 text);").Exec(); err != nil {
+		t.Logf(" create table returned error :%v \n", err)
+	}
+}
diff --git a/mrt/mrt.go b/mrt/mrt.go
@@ -0,0 +1,377 @@
+package mrt
+
+import (
+ "bytes"
+ "encoding/binary"
+ "errors"
+ "fmt"
+ "io"
+ "log"
+ "net"
+ "os"
+ "unicode/utf8"
+ //"runtime"
+)
+
+// logger is the package-wide logger. It writes to stderr with a "go-mrt: "
+// prefix, the date, and the full file path and line of the call site.
+var logger = log.New(os.Stderr, "go-mrt: ", log.Ldate|log.Llongfile)
+
+// MrtHdr is the fixed-size header found at the start of every MRT record.
+// NewMrtHdr decodes it from big-endian bytes; its encoded size is
+// MrtHdr_size (4+2+2+4 bytes).
+type MrtHdr struct {
+ Mrt_timestamp uint32
+ Mrt_type uint16 // matched against the MSG_* constants by PFunc
+ Mrt_subtype uint16 // interpreted per type, e.g. the BGP4MP_* constants
+ Mrt_len uint32 // number of message bytes that follow the header
+}
+
+// parsefunc decodes the bytes that follow an MRT header into a type-specific
+// header value. MrtMsg.PFunc returns one matching the record's type.
+type parsefunc func([]byte) MrtSubTyper
+
+// MrtMsg pairs a decoded MRT header with the raw bytes that follow it.
+type MrtMsg struct {
+ Hdr MrtHdr
+ BGPMsg []byte // message bytes after the header; input for the parsefunc from PFunc
+}
+
+// MrtSubTyper is implemented by every type-specific header that a parsefunc
+// can return.
+type MrtSubTyper interface {
+ Type() string //almost dummy functionality
+ String() string
+}
+
+// MrtOSPFHdr is the header produced by the OSPF parser in MrtMsg.PFunc.
+type MrtOSPFHdr struct {
+ otype uint16 // MRT subtype of the record this header came from
+ RemoteIP uint32 // read from the message as a big-endian 32-bit value
+ LocalIP uint32 // read from the message as a big-endian 32-bit value
+}
+
+// Type returns the fixed name of this header kind.
+func (m *MrtOSPFHdr) Type() string {
+ return "OSPFHdr"
+}
+
+// String renders the header as text, printing both addresses in dotted-quad
+// form.
+//
+// RemoteIP and LocalIP are decoded from the message as big-endian uint32
+// values, so the most significant byte is the first octet of the address.
+func (m *MrtOSPFHdr) String() string {
+	toIP := func(v uint32) net.IP {
+		return net.IPv4(byte(v>>24), byte(v>>16), byte(v>>8), byte(v))
+	}
+	return fmt.Sprintf("OSPF Header. Type [%d] Remote IP [%s] Local IP [%s]", m.otype, toIP(m.RemoteIP), toIP(m.LocalIP))
+}
+
+// MrtInfoMsg is the header produced by the informational-message parser in
+// MrtMsg.PFunc.
+type MrtInfoMsg struct {
+ inftype uint16 // MRT type of the record this message came from
+ optmsg string // decoded optional text, or a placeholder when none was decoded
+}
+
+// String renders the message type and its optional text.
+func (m *MrtInfoMsg) String() string {
+ return fmt.Sprintf("Informational Message. Type [%v] Optstring [%s]", m.inftype, m.optmsg)
+}
+
+// Type returns the same text as String; unlike the other header kinds it is
+// not a fixed name.
+func (m *MrtInfoMsg) Type() string {
+ return m.String()
+}
+
+// PFunc selects the parser for the message body from the MRT type and
+// subtype in m.Hdr.
+//
+// It returns (parser, true) for informational (MSG_START, MSG_I_AM_DEAD),
+// OSPF and BGP4MP records with a supported subtype. For everything else it
+// logs the reason and returns (nil, false).
+//
+// The returned parsers cannot report errors through their signature: the OSPF
+// and BGP4MP parsers panic when the leading fields cannot be read, and the
+// BGP4MP message parser also panics on an unknown address family.
+func (m *MrtMsg) PFunc() (ret parsefunc, ok bool) {
+	subtype := m.Hdr.Mrt_subtype
+	mtype := m.Hdr.Mrt_type
+
+	// Width in bytes of the AS numbers in a BGP4MP header. The *_AS4
+	// subtypes carry 4-byte AS numbers, all others 2-byte ones; the _LOCAL
+	// variants share the layout of their non-local counterparts.
+	asnLen := 2
+	switch subtype {
+	case BGP4MP_MESSAGE_AS4, BGP4MP_STATE_CHANGE_AS4, BGP4MP_MESSAGE_AS4_LOCAL:
+		asnLen = 4
+	}
+
+	// ipLen maps a BGP4MP address family to the address size in bytes, or 0
+	// when the family is not recognised.
+	ipLen := func(family uint16) int {
+		switch family {
+		case 1:
+			return 4
+		case 2:
+			return 16
+		}
+		return 0
+	}
+
+	infofunc := func(a []byte) MrtSubTyper {
+		infomsg := &MrtInfoMsg{inftype: mtype, optmsg: "No Optional Message"}
+		var runes []rune
+		for len(a) > 0 {
+			r, sz := utf8.DecodeRune(a)
+			// (RuneError, 1) signals invalid UTF-8; a well-formed U+FFFD
+			// decodes with a larger width and is accepted.
+			if r == utf8.RuneError && sz == 1 {
+				logger.Println("failed to decode rune in optional message")
+				return infomsg
+			}
+			a = a[sz:]
+			runes = append(runes, r)
+		}
+		if len(runes) > 0 {
+			infomsg.optmsg = string(runes)
+		}
+		return infomsg
+	}
+
+	ospffunc := func(a []byte) MrtSubTyper {
+		hdr := &MrtOSPFHdr{otype: subtype}
+		buf := bytes.NewReader(a)
+		for _, field := range []*uint32{&hdr.RemoteIP, &hdr.LocalIP} {
+			if err := binary.Read(buf, binary.BigEndian, field); err != nil {
+				panic(fmt.Sprintf("error while reading binary OSPF header: %s", err))
+			}
+		}
+		return hdr
+	}
+
+	bgp4mpscfunc := func(a []byte) MrtSubTyper {
+		hdr := &MrtBGP4MPStateChangeHdr{
+			PeerASN:  make([]byte, asnLen),
+			LocalASN: make([]byte, asnLen),
+		}
+		buf := bytes.NewReader(a)
+		if err := binary.Read(buf, binary.BigEndian, hdr.PeerASN); err != nil {
+			panic(fmt.Sprintf("error while reading binary BGP4MP header: %s", err))
+		}
+		// The remaining reads are best effort: on truncated input the
+		// affected fields keep their zero values.
+		binary.Read(buf, binary.BigEndian, hdr.LocalASN)
+		binary.Read(buf, binary.BigEndian, &hdr.InterfaceInd)
+		binary.Read(buf, binary.BigEndian, &hdr.AddrFamily)
+		if n := ipLen(hdr.AddrFamily); n > 0 {
+			hdr.PeerIP = make([]byte, n)
+			hdr.LocalIP = make([]byte, n)
+		}
+		binary.Read(buf, binary.BigEndian, hdr.PeerIP)
+		binary.Read(buf, binary.BigEndian, hdr.LocalIP)
+		binary.Read(buf, binary.BigEndian, &hdr.OldState)
+		binary.Read(buf, binary.BigEndian, &hdr.NewState)
+		return hdr
+	}
+
+	bgp4mpmsgfunc := func(a []byte) MrtSubTyper {
+		hdr := &MrtBGP4MPMsgHdr{
+			PeerASN:  make([]byte, asnLen),
+			LocalASN: make([]byte, asnLen),
+		}
+		buf := bytes.NewReader(a)
+		if err := binary.Read(buf, binary.BigEndian, hdr.PeerASN); err != nil {
+			panic(fmt.Sprintf("error while reading binary BGP4MP header: %s", err))
+		}
+		// Best-effort reads, as in the state-change parser; a truncated
+		// header ends up with address family 0 and panics below.
+		binary.Read(buf, binary.BigEndian, hdr.LocalASN)
+		binary.Read(buf, binary.BigEndian, &hdr.InterfaceInd)
+		binary.Read(buf, binary.BigEndian, &hdr.AddrFamily)
+		n := ipLen(hdr.AddrFamily)
+		if n == 0 {
+			panic("Address Family in BGP4MP msg func is wrong")
+		}
+		hdr.PeerIP = make([]byte, n)
+		hdr.LocalIP = make([]byte, n)
+		binary.Read(buf, binary.BigEndian, hdr.PeerIP)
+		binary.Read(buf, binary.BigEndian, hdr.LocalIP)
+		return hdr
+	}
+
+	switch mtype {
+	case MSG_PROTOCOL_BGP4MP:
+		switch subtype {
+		case BGP4MP_STATE_CHANGE, BGP4MP_STATE_CHANGE_AS4:
+			ret, ok = bgp4mpscfunc, true
+		case BGP4MP_MESSAGE, BGP4MP_MESSAGE_AS4, BGP4MP_MESSAGE_LOCAL, BGP4MP_MESSAGE_AS4_LOCAL:
+			ret, ok = bgp4mpmsgfunc, true
+		}
+	case MSG_START, MSG_I_AM_DEAD:
+		if subtype == 0 {
+			ret, ok = infofunc, true
+		} else {
+			logger.Println("Mrt type is Informational but Subtype non-zero")
+		}
+	case MSG_PROTOCOL_OSPF:
+		if subtype == 0 || subtype == 1 {
+			ret, ok = ospffunc, true
+		} else {
+			logger.Println("Mrt type is OSPF but Subtype is neither 0 or 1")
+		}
+	case MSG_NULL, MSG_DIE, MSG_PEER_DOWN, MSG_PROTOCOL_BGP, MSG_PROTOCOL_IDRP, MSG_PROTOCOL_BGP4PLUS, MSG_PROTOCOL_BGP4PLUS1:
+		logger.Println("Deprecated message type")
+	default:
+		logger.Printf("unknown. header [%v]\n", m.Hdr)
+	}
+	return ret, ok
+}
+
+// MrtBGP4MPStateChangeHdr is the header produced by the BGP4MP state-change
+// parser in MrtMsg.PFunc. The AS number and address fields hold the raw bytes
+// as they appear in the message.
+type MrtBGP4MPStateChangeHdr struct {
+ PeerASN []byte // 2 bytes for BGP4MP_STATE_CHANGE, 4 bytes otherwise
+ LocalASN []byte // same width as PeerASN
+ InterfaceInd uint16
+ AddrFamily uint16 // 1 selects 4-byte addresses, 2 selects 16-byte addresses
+ PeerIP []byte // left nil by the parser for any other address family
+ LocalIP []byte // same size as PeerIP
+ OldState uint16
+ NewState uint16
+}
+
+// Type returns the fixed name of this header kind.
+func (m *MrtBGP4MPStateChangeHdr) Type() string {
+ return "BGP4MPStateChange"
+}
+
+// String returns the same fixed name as Type; no field is rendered.
+func (m *MrtBGP4MPStateChangeHdr) String() string {
+ return "BGP4MPStateChange"
+}
+
+// MrtBGP4MPMsgHdr is the header produced by the BGP4MP message parser in
+// MrtMsg.PFunc. The AS number and address fields hold the raw bytes as they
+// appear in the message.
+type MrtBGP4MPMsgHdr struct {
+ PeerASN []byte
+ LocalASN []byte
+ InterfaceInd uint16
+ AddrFamily uint16 // 1 selects 4-byte addresses, 2 selects 16-byte addresses
+ PeerIP []byte
+ LocalIP []byte
+}
+
+// Type returns the fixed name of this header kind.
+func (m *MrtBGP4MPMsgHdr) Type() string {
+ return "BGP4MPMsg"
+}
+
+// String renders the local and peer addresses of the header.
+//
+// A 16-byte address is printed in full; any other address of at least four
+// bytes is printed as the IPv4 address formed by its first four bytes. When
+// either address is shorter than four bytes a fixed "unable to read" text is
+// returned instead.
+func (m *MrtBGP4MPMsgHdr) String() string {
+	if len(m.PeerIP) < 4 || len(m.LocalIP) < 4 {
+		return "BGP4MPMsg unable to read IPs"
+	}
+	toIP := func(b []byte) net.IP {
+		if len(b) == net.IPv6len {
+			return net.IP(b)
+		}
+		return net.IPv4(b[0], b[1], b[2], b[3])
+	}
+	// The peer is the remote end of the session, so it goes under RemoteIP.
+	return fmt.Sprintf("LocalIP:%s RemoteIP:%s", toIP(m.LocalIP), toIP(m.PeerIP))
+}
+
+// MrtTableDumpV1Hdr describes a table-dump entry header.
+// NOTE(review): no parser in this file fills this type yet; field meanings
+// are inferred from their names and should be confirmed against the MRT
+// format specification.
+type MrtTableDumpV1Hdr struct {
+ ViewNum uint16
+ SeqNum uint16
+ Prefix []byte
+ PrefixLen uint8
+ Status uint8
+ OrigTime uint32
+ PeerIP []byte
+ PeerAS uint16
+ AttrLen uint16
+}
+
+// Type returns the fixed name of this header kind.
+func (m *MrtTableDumpV1Hdr) Type() string {
+ return "TableDumpV1Hdr"
+}
+
+// String returns the same fixed name as Type; no field is rendered.
+func (m *MrtTableDumpV1Hdr) String() string {
+ return "TableDumpV1Hdr"
+}
+
+// MrtFile reads MRT records one at a time from an underlying reader; see
+// its Read method.
+type MrtFile struct {
+ file io.Reader // source of the MRT byte stream
+ entries uint32 // number of record headers decoded so far
+ off int64 // bytes consumed by records that were read completely
+}
+
+// Size limits used when reading MRT records.
+const (
+ MrtHdr_size = 12 // encoded size of MrtHdr in bytes (4+2+2+4)
+ dump_size = 10000 // upper bound MrtFile.Read applies to header plus message
+)
+
+// MRT record types, compared against MrtHdr.Mrt_type. Values 0 through 13
+// come from iota; the later ones are assigned explicitly because the
+// numbering has gaps.
+const (
+ MSG_NULL = iota // 0 empty msg (deprecated)
+ MSG_START // 1 sender is starting up
+ MSG_DIE // 2 receiver should shut down (deprecated)
+ MSG_I_AM_DEAD // 3 sender is shutting down
+ MSG_PEER_DOWN // 4 sender's peer is down (deprecated)
+ MSG_PROTOCOL_BGP // 5 msg is a BGP packet (deprecated)
+ MSG_PROTOCOL_RIP // 6 msg is a RIP packet
+ MSG_PROTOCOL_IDRP // 7 msg is an IDRP packet (deprecated)
+ MSG_PROTOCOL_RIPNG // 8 msg is a RIPNG packet
+ MSG_PROTOCOL_BGP4PLUS // 9 msg is a BGP4+ packet (deprecated)
+ MSG_PROTOCOL_BGP4PLUS1 // 10 msg is a BGP4+ (draft 01) (deprecated)
+ MSG_PROTOCOL_OSPF // 11 msg is an OSPF packet
+ MSG_TABLE_DUMP // 12 routing table dump
+ MSG_TABLE_DUMP_V2 // 13 routing table dump, version 2
+ MSG_PROTOCOL_BGP4MP = 16 // 16 zebra's own packet format
+ MSG_PROTOCOL_BGP4MP_ET = 17
+ MSG_PROTOCOL_ISIS = 32 // 32 msg is an IS-IS packet
+ MSG_PROTOCOL_ISIS_ET = 33
+ MSG_PROTOCOL_OSPFV3 = 48 // 48 msg is an OSPFv3 packet
+ MSG_PROTOCOL_OSPFV3_ET = 49
+)
+
+// Subtypes of MSG_PROTOCOL_BGP4MP records, compared against
+// MrtHdr.Mrt_subtype.
+const (
+ BGP4MP_STATE_CHANGE = 0 // state change
+ BGP4MP_MESSAGE = 1 // bgp message
+ BGP4MP_MESSAGE_AS4 = 4 // same as BGP4MP_MESSAGE with 4byte AS
+ BGP4MP_STATE_CHANGE_AS4 = 5 // same as BGP4MP_STATE_CHANGE with 4byte AS
+ BGP4MP_MESSAGE_LOCAL = 6 // same as BGP4MP_MESSAGE but for self-originated updates
+ BGP4MP_MESSAGE_AS4_LOCAL = 7 // 4byte AS variant of BGP4MP_MESSAGE_LOCAL (marked "not implemented" by the author)
+)
+
+// OSPF subtypes; presumably the values of MrtHdr.Mrt_subtype for
+// MSG_PROTOCOL_OSPF records, which PFunc accepts when they are 0 or 1.
+const (
+ OSPF_STATE_CHANGE = iota // 0
+ OSPF_LSA_UPDATE // 1
+)
+
+// NewMrtHdr decodes an MRT header from the big-endian bytes at the start of
+// b. It returns the decoding error when b is too short to hold a header.
+func NewMrtHdr(b []byte) (ret MrtHdr, err error) {
+	err = binary.Read(bytes.NewReader(b), binary.BigEndian, &ret)
+	return ret, err
+}
+
+// NewMrtFile wraps f in an MrtFile whose entry counter and offset start at
+// zero.
+func NewMrtFile(f io.Reader) (ret MrtFile) {
+	return MrtFile{file: f, entries: 0, off: 0}
+}
+
+// SplitMrt is a bufio.SplitFunc: pass it to bufio.Scanner.Split to iterate
+// over the MRT records of a stream. Each token is one complete record, i.e.
+// the MrtHdr_size header bytes followed by the Mrt_len message bytes.
+//
+// While the buffered data does not yet hold a complete record it requests
+// more input. At end of input this ends the scan, so a truncated trailing
+// record is dropped without an error.
+func SplitMrt(data []byte, atEOF bool) (advance int, token []byte, err error) {
+	// Only len(data) bytes are valid; the capacity may extend over stale
+	// buffer contents, so the header check must use the length.
+	if len(data) < MrtHdr_size {
+		return 0, nil, nil
+	}
+	// binary.Read inside NewMrtHdr takes care of the big-endian encoding.
+	hdr, errh := NewMrtHdr(data[:MrtHdr_size])
+	if errh != nil {
+		return 0, nil, errh
+	}
+	// Widen before adding so that a huge Mrt_len cannot wrap around.
+	totlen := int(hdr.Mrt_len) + MrtHdr_size
+	if totlen < MrtHdr_size {
+		// Only reachable where int is 32 bits wide.
+		return 0, nil, errors.New("mrt message length overflows int")
+	}
+	if len(data) < totlen { // need to read more
+		return 0, nil, nil
+	}
+	return totlen, data[:totlen], nil
+}
+
+// Read reads exactly one MRT record (header plus message) into b and returns
+// the number of bytes stored.
+//
+// Unlike a general io.Reader it never returns a partial record without an
+// error. It fails when b cannot hold a header, when the underlying reader
+// ends or fails, and when the record is dump_size bytes or larger or does not
+// fit into the capacity of b. At a clean end of input the error is io.EOF.
+// After a size or short-read error the header bytes are already consumed, so
+// the stream is no longer positioned at a record boundary.
+func (f *MrtFile) Read(b []byte) (n int, err error) {
+	if cap(b) < MrtHdr_size {
+		err = errors.New("buffer size less than header size")
+		return
+	}
+	// io.ReadFull guards against short reads from the underlying reader.
+	n, err = io.ReadFull(f.file, b[:MrtHdr_size])
+	if err != nil {
+		return
+	}
+	hdr, errh := NewMrtHdr(b[:MrtHdr_size])
+	if errh != nil {
+		err = fmt.Errorf("error in reading header from offset %v : %s", f.off, errh)
+		return
+	}
+	f.entries++
+	// Compute the record size in a signed 64-bit type: the unsigned
+	// subtraction used previously wrapped around instead of going negative.
+	total := int64(hdr.Mrt_len) + MrtHdr_size
+	if total >= dump_size || total > int64(cap(b)) {
+		err = fmt.Errorf("bgp message of size:%v at offset:%v is too large", hdr.Mrt_len, f.off+MrtHdr_size)
+		return
+	}
+	nr, rerr := io.ReadFull(f.file, b[MrtHdr_size:total])
+	n += nr // header + len of read
+	if rerr != nil {
+		err = fmt.Errorf("error in reading bgp message of size :%v . got :%v bytes.", hdr.Mrt_len, n)
+		return
+	}
+	f.off += int64(n)
+	return
+}
diff --git a/mrt/mrt_test.go b/mrt/mrt_test.go
@@ -0,0 +1,118 @@
+package mrt
+
+import (
+ "bufio"
+ "bytes"
+ "encoding/binary"
+ "fmt"
+ "net"
+ "os"
+ "testing"
+)
+
+func TestMrtHdr(t *testing.T) {
+ buf := new(bytes.Buffer)
+ var tdate, tlen uint32 = 1, 4
+ var ttype, tsubtype uint16 = 2, 3
+ mrt := &MrtHdr{tdate, ttype, tsubtype, tlen}
+ fmt.Printf("date:%v type:%v subtype:%v len:%v\n", tdate, ttype, tsubtype, tlen)
+ binary.Write(buf, binary.BigEndian, mrt)
+ fmt.Printf("binary mrt: %x\n", buf.Bytes())
+ mhdr, err := NewMrtHdr(buf.Bytes())
+ if err != nil {
+ t.Fatal(err)
+ }
+ fmt.Printf("recreating MrtHdr from binary :%+v \n", mhdr)
+}
+
+// TestMrtPFunc checks which type/subtype combinations PFunc accepts and runs
+// the returned parsers on small hand-built bodies. Parser output is only
+// printed, not asserted.
+func TestMrtPFunc(t *testing.T) {
+ var (
+ tt1, ts1 = uint16(1), uint16(0) //start
+ tt2, ts2 = uint16(3), uint16(1) //i am dead , but wrong subtype
+ tt3, ts3 = uint16(2), uint16(0) //deprecated
+ tt4, ts4 = uint16(11), uint16(0) //ospf state change
+ // tbuf is shared by all four messages below; they alias one backing array
+ tbuf = []byte{0, 0, 0, 0, 0, 0, 0, 0}
+ tf parsefunc
+ ok bool
+ )
+ //binbuf := new(bytes.Buffer)
+ mrt1 := &MrtMsg{
+ Hdr: MrtHdr{1, tt1, ts1, 10},
+ BGPMsg: tbuf,
+ }
+ mrt2 := &MrtMsg{
+ Hdr: MrtHdr{1, tt2, ts2, 10},
+ BGPMsg: tbuf,
+ }
+ mrt3 := &MrtMsg{
+ Hdr: MrtHdr{1, tt3, ts3, 10},
+ BGPMsg: tbuf,
+ }
+ mrt4 := &MrtMsg{
+ Hdr: MrtHdr{1, tt4, ts4, 10},
+ BGPMsg: tbuf,
+ }
+ fmt.Println("trying to parse informational message")
+ if tf, ok = mrt1.PFunc(); !ok {
+ t.Fatal("tf should be non nil")
+ }
+ hdr := tf(mrt1.BGPMsg)
+ fmt.Printf("type is :%s\n", hdr.Type())
+ fmt.Println("trying to parse informational message with opt string")
+ // reuse mrt1 as an "i am dead" message: type 3 with its subtype still 0
+ mrt1.BGPMsg = []byte{'f', 'o', 'o', ' ', 's', 't', 'r'}
+ mrt1.Hdr.Mrt_type = tt2
+ if tf, ok = mrt1.PFunc(); !ok {
+ t.Fatal("tf should be non nil")
+ }
+ hdr = tf(mrt1.BGPMsg)
+ fmt.Printf("type is :%s\n", hdr.Type())
+ fmt.Println("trying to parse malformed informational message")
+ if tf, ok = mrt2.PFunc(); ok {
+ t.Fatal("this should fail with tf being nil cause subtype is non-0")
+ }
+ fmt.Println("trying to parse deprecated message")
+ if tf, ok = mrt3.PFunc(); ok {
+ t.Fatal("this should fail with tf being nil cause it's deprecated")
+ }
+ fmt.Println("trying to parse OSPF message")
+ // Decoding the 4 address bytes as little-endian and storing the result as
+ // big-endian writes each address into the buffer with its bytes reversed
+ // (4.3.2.1 and 8.7.6.5).
+ binary.BigEndian.PutUint32(mrt4.BGPMsg[:4], binary.LittleEndian.Uint32(net.IPv4(1, 2, 3, 4).To4()))
+ binary.BigEndian.PutUint32(mrt4.BGPMsg[4:], binary.LittleEndian.Uint32(net.IPv4(5, 6, 7, 8).To4()))
+ //binary.Write(binbuf, binary.BigEndian, net.IPv4allsys.To4())
+ //mrt4.BGPMsg = make([]byte,8)
+ //mrt4.BGPMsg = binbuf.Bytes()
+ //copy(mrt4.BGPMsg,binbuf.Bytes())
+ if tf, ok = mrt4.PFunc(); !ok {
+ t.Fatal("this shouldn't fail")
+ }
+ hdr = tf(mrt4.BGPMsg)
+ fmt.Printf("type is :%s .String representation: %s\n", hdr.Type(), hdr)
+}
+
+// TestScan walks ../tests/mrt3 with a bufio.Scanner that uses SplitMrt,
+// decodes every record header and runs the matching parser when PFunc
+// provides one. A record with a zero-length body ends the walk early; a
+// scanner error fails the test.
+func TestScan(t *testing.T) {
+	t.Log("testing the scanner interface")
+	f, err := os.Open("../tests/mrt3")
+	if err != nil {
+		t.Fatal(err)
+	}
+	defer f.Close()
+	mrtscanner := bufio.NewScanner(f)
+	mrtscanner.Split(SplitMrt)
+	count := 0
+	for mrtscanner.Scan() {
+		count++
+		dat := mrtscanner.Bytes()
+		// SplitMrt has already decoded this header successfully, so the
+		// error can be ignored here.
+		h, _ := NewMrtHdr(dat[:MrtHdr_size])
+		if h.Mrt_len == 0 {
+			t.Logf("terminating from 0 mrt len")
+			return
+		}
+		mrtmsg := MrtMsg{Hdr: h, BGPMsg: dat[MrtHdr_size:]}
+		if tf, ok := mrtmsg.PFunc(); ok {
+			tf(mrtmsg.BGPMsg)
+		}
+	}
+	if err := mrtscanner.Err(); err != nil {
+		t.Errorf("error: %s", err)
+	}
+	t.Logf("scanned and parsed: %d entries from bufio", count)
+}
diff --git a/tests/mrt1 b/tests/mrt1
Binary files differ.
diff --git a/tests/mrt2 b/tests/mrt2
Binary files differ.
diff --git a/tests/mrt3 b/tests/mrt3
Binary files differ.