go-bgp

a collection of golang BGP tools to monitor, archive and serve
git clone git://git.2f30.org/go-bgp.git
Log | Files | Refs | README

commit f36b04125906b8f654a46650d83d7e17caa1e8d5
Author: dsp <dsp@2f30.org>
Date:   Tue Feb 10 19:15:16 -0700

initial commit

Diffstat:
Makefile | 12++++++++++++
README | 60++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
archive/archive.go | 607+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
bgp.go | 1+
cmd/archive_server.go | 51+++++++++++++++++++++++++++++++++++++++++++++++++++
doc/draft-ietf-grow-mrt-11.txt | 1625+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
doc/rfc1771.txt | 3195+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
doc/rfc4360.txt | 675+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
doc/rfc4364.txt | 2635+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
monitor/monitor_backend.go | 22++++++++++++++++++++++
monitor/monitor_backend_test.go | 32++++++++++++++++++++++++++++++++
mrt/mrt.go | 377+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
mrt/mrt_test.go | 118+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
tests/mrt1 | 0
tests/mrt2 | 0
tests/mrt3 | 0
16 files changed, 9410 insertions(+), 0 deletions(-)
diff --git a/Makefile b/Makefile @@ -0,0 +1,12 @@ +all: allbin + +test: + go test ./... + +allbin: cmd/archive_server.go + go build cmd/archive_server.go + +clean: + rm -f archive_server + go clean + diff --git a/README b/README @@ -0,0 +1,60 @@ +===================================================================== + go-bgp by DsP <dsp@2f30.org> +===================================================================== +[General] +go-bgp is a collection of pure golang libraries and tools for: +reading and writing MRT files +parsing BGP messages +exposing archived BGP messages in various formats over RESTful HTTP/2 + +[Details] +mrt/ + Using this module from golang allows you to open a file and then using + bufio to set the splitfunc to the provided SplitMrt + then using .Scan() you can get the []bytes of each MRT message + + +archive/ + This module allows the scanning of hierarchical dated dirs + that contain either XML encoded files or MRT files. + then it exposes an API that allows requests in the form of + http://host:port/archive?start=YYYYMMDDHHMMSS&end=YYYYMMDDHHMMSS&type=mrt + or + http://host:port/archive/conf?range + http://host:port/archive/conf?files + +monitor/ + bgp monitor + +tests/ + contain data for unit tests + +doc/ + relevant RFCs + +cmd/ + executable programs + +[License] +Copyright (c) 2015, dsp <dsp@2f30.org> +All rights reserved. + +Redistribution and use in source and binary forms, with or without modification, +are permitted provided that the following conditions are met: + +1. Redistributions of source code must retain the above copyright notice, +this list of conditions and the following disclaimer. + +2. Redistributions in binary form must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. 
+ +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES +OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. +IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, +OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, +WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/archive/archive.go b/archive/archive.go @@ -0,0 +1,607 @@ +package archive + +import ( + "errors" + "fmt" + "log" + "net/http" + "net/url" + //"io/ioutil" + //"bytes" + "bufio" + "compress/bzip2" + mrt "go-bgp/mrt" + "io" + "os" + "path/filepath" + "sort" + "strconv" + "strings" + "sync" + "time" + "unicode" +) + +const ( + GET = "GET" + PUT = "PUT" + POST = "POST" + DELETE = "DELETE" +) + +var ( + errbadreq = errors.New("malformed request") + errbaddate = errors.New("dates should be in a YYYYMMDDHHMM format and start should be earlier than end") + errempty = errors.New("archive empty") + errdate = errors.New("no such date in archive") +) + +type Resource interface { + Get(url.Values) (int, chan reply) + Put(url.Values) (int, chan reply) + Post(url.Values) (int, chan reply) + Delete(url.Values) (int, chan reply) +} + +type ( + GetNotAllowed struct{} + PutNotAllowed struct{} + PostNotAllowed struct{} + DeleteNotAllowed struct{} +) + +func (GetNotAllowed) Get(vals url.Values) (int, chan reply) { + return 405, nil +} + +func (PutNotAllowed) Put(vals url.Values) (int, chan reply) { + return 405, nil +} +func (PostNotAllowed) Post(vals url.Values) (int, chan reply) { + return 405, nil +} +func 
(DeleteNotAllowed) Delete(vals url.Values) (int, chan reply) { + return 405, nil +} + +type API struct{} + +func (api *API) requestHandlerFunc(resource Resource) http.HandlerFunc { + return func(rw http.ResponseWriter, req *http.Request) { + var ( + datac chan reply + code int + ) + req.ParseForm() + method := req.Method + vals := req.Form + switch method { + case GET: + code, datac = resource.Get(vals) + case PUT: + code, datac = resource.Put(vals) + case POST: + code, datac = resource.Post(vals) + case DELETE: + code, datac = resource.Delete(vals) + } + rw.WriteHeader(code) + if datac != nil { // we got a proper channel to get datafrom + //go func(dc <-chan reply) { // fire a goroutine that will end upon the chan getting closed + for r := range datac { + if r.err == nil { + rw.Write(r.data) + } else { + log.Printf("Error in received from data channel:%s\n", r.err) + rw.Write([]byte(fmt.Sprintf("%s\n", r.err))) + } + } + //}(datac) + } + } +} + +func (api *API) AddResource(resource Resource, path string) { + http.HandleFunc(path, api.requestHandlerFunc(resource)) +} + +func (api *API) Start(port int) { + portstr := fmt.Sprintf(":%d", port) + http.ListenAndServe(portstr, nil) +} + +type reply struct { + data []byte + err error +} + +//To perform a query asynchronously on possibly many files we fire multiple goroutines +//that all write their results to chan reply, and we also need the waitgroup +//to know when we should close the channel to end the http transaction +type archive interface { + Query(time.Time, time.Time, chan reply, *sync.WaitGroup) + visit(string, os.FileInfo, error) error +} + +type xmlstring struct { + timestr string + msg string + time time.Time +} + +func (x *xmlstring) String() string { + return x.msg +} + +//implements Sort interface by time.Time +type archentryfile struct { + path string + sdate time.Time + sz int64 +} + +type timeentryslice []archentryfile + +func (p timeentryslice) Len() int { + return len(p) +} + +func (p timeentryslice) 
Less(i, j int) bool { + return p[i].sdate.Before(p[j].sdate) +} + +func (p timeentryslice) Swap(i, j int) { + p[i], p[j] = p[j], p[i] +} + +type fsarchive struct { + rootpathstr string + entryfiles *timeentryslice + tempentryfiles timeentryslice + curyr int + curmon int + curday int + reqchan chan string + scanning bool + Scanwg *sync.WaitGroup // expose it so callers are able to wait for scan to finish + scanch chan struct{} + timedelta time.Duration + descriminator string + conf *fsarconf + //present tha archive as a restful resource + PutNotAllowed + PostNotAllowed + DeleteNotAllowed +} + +type mrtarchive struct { + *fsarchive +} + +type xmlarchive struct { + *fsarchive +} + +type fsarconf struct { + arfiles *timeentryslice + PutNotAllowed + PostNotAllowed + DeleteNotAllowed +} + +//in order not to block in gets, we need to +//fire a new goroutine to send the reply on the channel +// the reason is that we create the channel here and we must +//return it to the responsewriter and any sends would block +//without the receiver being ready. +func (fsc *fsarconf) Get(values url.Values) (int, chan reply) { + retc := make(chan reply) + go func() { + defer close(retc) //must close the chan to let the listener finish. + if fsc.arfiles == nil { + log.Printf("nil arfile in fsarconf. 
ignoring request\n") + return + } + if _, ok := values["range"]; ok { + if len(*fsc.arfiles) > 0 { + f := *fsc.arfiles + dates := fmt.Sprintf("%s - %s\n", f[0].sdate, f[len(f)-1].sdate) + retc <- reply{data: []byte(dates), err: nil} + return + } + retc <- reply{data: nil, err: errempty} + return + } + if _, ok := values["files"]; ok { + for _, f := range *fsc.arfiles { + retc <- reply{data: []byte(fmt.Sprintf("%s\n", filepath.Base(f.path))), err: nil} + } + return + } + return + }() + return 200, retc +} + +func (fsa *fsarchive) GetImpl(values url.Values, ar archive) (int, chan reply) { + var grwg sync.WaitGroup + retc := make(chan reply) + timeAstrs, ok1 := values["start"] + timeBstrs, ok2 := values["end"] + if len(timeAstrs) != len(timeBstrs) || !ok1 || !ok2 { + retc <- reply{data: nil, err: errbadreq} + goto done + } + for i := 0; i < len(timeAstrs); i++ { + log.Printf("timeAstr:%s timeBstr:%s", timeAstrs[i], timeBstrs[i]) + timeA, errtime := time.Parse("200601021504", timeAstrs[i]) + timeB, errtime := time.Parse("200601021504", timeBstrs[i]) + if errtime != nil || timeB.Before(timeA) { + retc <- reply{data: nil, err: errbaddate} + } else { + //buf.WriteString(fmt.Sprintf("quering from t0:%s - t1:%s\n", timeA, timeB)) + ar.Query(timeA, timeB, retc, &grwg) //this will fire a new goroutine + } + } + // the last goroutine that will wait for all we invoked and close the chan + go func(wg *sync.WaitGroup) { + wg.Wait() //wait for all the goroutines to finish sending + close(retc) //close the chan so that range in responsewriter will finish + log.Printf("closing the chan\n") + }(&grwg) +done: + return 200, retc +} + +func (fsa *mrtarchive) Get(values url.Values) (int, chan reply) { + return fsa.fsarchive.GetImpl(values, fsa) +} + +func (fsa *xmlarchive) Get(values url.Values) (int, chan reply) { + return fsa.fsarchive.GetImpl(values, fsa) +} + +func (ma *mrtarchive) Query(ta, tb time.Time, retc chan reply, wg *sync.WaitGroup) { + log.Printf("querying mrt from %s to 
%s\n", ta, tb) + go func(rc chan<- reply) { + wg.Add(1) + ef := *ma.entryfiles + var scanner *bufio.Scanner + defer wg.Done() + if len(ef) == 0 { + rc <- reply{nil, errempty} + return + } + if tb.Before(ef[0].sdate) || ta.After(ef[len(ef)-1].sdate.Add(ma.timedelta)) { + rc <- reply{nil, errdate} + return + } + i := sort.Search(len(ef), func(i int) bool { + return ef[i].sdate.After(ta.Add(-ma.timedelta - time.Second)) + }) + j := sort.Search(len(ef), func(i int) bool { + return ef[i].sdate.After(tb) + }) + for k := i; k < j; k++ { + fext := filepath.Ext(ef[k].path) + file, ferr := os.Open(ef[k].path) + if ferr != nil { + log.Println("failed opening file: ", ef[k].path, " ", ferr) + continue + } + if fext == ".bz2" { + log.Printf("bunzip2 file. opening decompression stream\n") + bzreader := bzip2.NewReader(file) + scanner = bufio.NewScanner(bzreader) + scanner.Split(mrt.SplitMrt) + } else { + log.Printf("no extension on file: %s. opening normally\n", ef[k].path) + scanner = bufio.NewScanner(file) + scanner.Split(mrt.SplitMrt) + } + //buf.WriteString(fmt.Sprintf(" [ file: %s ] ", ef[k].path)) + startt := time.Now() + for scanner.Scan() { + data := scanner.Bytes() + hdr, errh := mrt.NewMrtHdr(data[:mrt.MrtHdr_size]) + if errh != nil { + log.Printf("error in creating MRT header:%s", errh) + rc <- reply{data: nil, err: errh} + continue + } + date := time.Unix(int64(hdr.Mrt_timestamp), 0) + log.Printf("scanned mrt with date:%s", date) + /* + dateindi := strings.Index(str, "<DATETIME>") + if dateindi == -1 { + log.Println("could not locate DATETIME string in xml msg: ", str) + continue + } + dateindi = dateindi + 10 // go to start of date data + dateindj := strings.Index(str[dateindi:], "</DATETIME>") + if dateindj == -1 { + log.Println("could not locate closing </DATETIME> string in xml msg: ", str) + continue + } + dateindj = dateindj + dateindi // to return it to the relative start of line pos + xmldate, derr := time.Parse(time.RFC3339, str[dateindi:dateindj]) + if derr 
!= nil { + log.Println("could not parse datetime: %s\n", derr) + continue + } + //log.Printf("parse xml message date: %s\n", xmldate) + if xmldate.After(ta) && xmldate.Before(tb) { + //buf.WriteString(fmt.Sprintf("%s\n",str)) + rc <- reply{data: []byte(fmt.Sprintf("%s\n", str)), err: nil} + } else if xmldate.After(tb) { //only later measurements in this file. leaving + break + }*/ + } + if err := scanner.Err(); err != nil && err != io.EOF { + log.Printf("file scanner error:%s\n", err) + } + log.Printf("finished parsing file %s size %d in %s\n", ef[k].path, ef[k].sz, time.Since(startt)) + file.Close() + } + return + }(retc) +} + +func (fsa *mrtarchive) visit(path string, f os.FileInfo, err error) error { + fname := f.Name() + log.Print("examining mrt: ", fname) + if strings.LastIndex(path, fsa.descriminator) == -1 { + log.Printf("visit: descriminator:%s not found in path:%s . ignoring\n", fsa.descriminator, path) + return nil + } + if f.Mode().IsRegular() { + numind := strings.IndexFunc(fname, unicode.IsDigit) + extind := strings.LastIndex(fname, ".bz2") + if numind == -1 || extind == -1 || extind-numind != 13 { + log.Print("file: ", fname, " not in foo.YYYYMMDD.HHMM.bz2... format. 
extind:%d numberind:%d", extind, numind) + return nil + } + datestr := fname[numind:extind] + log.Println("datestr in filename is ", datestr) + time, errtime := time.Parse("20060102.1504", datestr) + if errtime != nil { + log.Print("time.Parse() failed on file: ", fname, " that should be in fooHHMM format with error: ", errtime) + return nil + } + fsa.tempentryfiles = append(fsa.tempentryfiles, archentryfile{path: path, sdate: time, sz: f.Size()}) + } + return nil +} + +func (fsa *xmlarchive) Query(ta, tb time.Time, retc chan reply, wg *sync.WaitGroup) { + log.Printf("querying from %s to %s\n", ta, tb) + go func(rc chan<- reply) { + wg.Add(1) + defer wg.Done() + ef := *fsa.entryfiles + var scanner *bufio.Scanner + if len(ef) == 0 { + rc <- reply{nil, errempty} + return + } + if tb.Before(ef[0].sdate) || ta.After(ef[len(ef)-1].sdate.Add(fsa.timedelta)) { + rc <- reply{nil, errdate} + return + } + i := sort.Search(len(ef), func(i int) bool { + return ef[i].sdate.After(ta.Add(-fsa.timedelta - time.Second)) + }) + j := sort.Search(len(ef), func(i int) bool { + return ef[i].sdate.After(tb) + }) + for k := i; k < j; k++ { + fext := filepath.Ext(ef[k].path) + file, ferr := os.Open(ef[k].path) + if ferr != nil { + log.Println("failed opening file: ", ef[k].path, " ", ferr) + continue + } + if fext == "" || fext == ".xml" { + log.Printf("no extension on file: %s. opening normally\n", ef[k].path) + scanner = bufio.NewScanner(file) + } else if fext == ".bz2" { + log.Printf("bunzip2 file. 
opening decompression stream\n") + bzreader := bzip2.NewReader(file) + scanner = bufio.NewScanner(bzreader) + } else { + log.Printf("unhandled file extension: %s\n", ef[j].path) + continue + } + //buf.WriteString(fmt.Sprintf(" [ file: %s ] ", ef[k].path)) + startt := time.Now() + for scanner.Scan() { + str := scanner.Text() + dateindi := strings.Index(str, "<DATETIME>") + if dateindi == -1 { + log.Println("could not locate DATETIME string in xml msg: ", str) + continue + } + dateindi = dateindi + 10 // go to start of date data + dateindj := strings.Index(str[dateindi:], "</DATETIME>") + if dateindj == -1 { + log.Println("could not locate closing </DATETIME> string in xml msg: ", str) + continue + } + dateindj = dateindj + dateindi // to return it to the relative start of line pos + xmldate, derr := time.Parse(time.RFC3339, str[dateindi:dateindj]) + if derr != nil { + log.Println("could not parse datetime: %s\n", derr) + continue + } + //log.Printf("parse xml message date: %s\n", xmldate) + if xmldate.After(ta) && xmldate.Before(tb) { + //buf.WriteString(fmt.Sprintf("%s\n",str)) + rc <- reply{data: []byte(fmt.Sprintf("%s\n", str)), err: nil} + } else if xmldate.After(tb) { //only later measurements in this file. 
leaving + break + } + } + if err := scanner.Err(); err != nil && err != io.EOF { + log.Printf("file scanner error:%s\n", err) + } + log.Printf("finished parsing file %s size %d in %s\n", ef[k].path, ef[k].sz, time.Since(startt)) + file.Close() + } + return + }(retc) +} + +func NewMRTArchive(path, descr string) *mrtarchive { + return &mrtarchive{NewFsArchive(path, descr)} +} + +func NewFsArchive(path, descr string) *fsarchive { + return &fsarchive{ + rootpathstr: path, + entryfiles: &timeentryslice{}, + tempentryfiles: timeentryslice{}, + curyr: 0, + curmon: 0, + curday: 0, + reqchan: make(chan string), + scanning: false, + Scanwg: &sync.WaitGroup{}, + scanch: make(chan struct{}), + timedelta: 15 * time.Minute, + descriminator: descr, + conf: &fsarconf{}, + } +} + +func NewXmlArchive(path, descr string) *xmlarchive { + return &xmlarchive{NewFsArchive(path, descr)} +} + +//trying to see if a dir name is in YYYY.MM form +//returns true, year, month if it is, or false, 0, 0 if not. +func isYearMonthDir(fname string) (res bool, yr int, mon int) { + var err error + res = false + yr = 0 + mon = 0 + isdot := func(r rune) bool { + if r == '.' { + return true + } + return false + } + ind := strings.IndexFunc(fname, isdot) + //not found or in the form foo. + if ind == -1 || ind == len(fname) { + return + } + //not YYYY or MM + if len(fname[:ind]) != 4 || len(fname[ind+1:]) != 2 { + return + } + yr, err = strconv.Atoi(fname[:ind]) + if err != nil { + return + } + mon, err = strconv.Atoi(fname[ind+1:]) + if err != nil { + return + } + if mon < 1 || mon > 12 { + return + } + //the values were found to be valid + res = true + return +} + +func (fsa *xmlarchive) visit(path string, f os.FileInfo, err error) error { + fname := f.Name() + log.Print("examining ", fname) + if strings.LastIndex(path, fsa.descriminator) == -1 { + log.Printf("visit: descriminator:%s not found in path:%s . 
ignoring\n", fsa.descriminator, path) + return nil + } + + if f.Mode().IsRegular() { + numind := strings.IndexFunc(fname, unicode.IsDigit) + xmlind := strings.LastIndex(fname, ".xml") + if numind == -1 || xmlind == -1 || xmlind-numind != 13 { + log.Print("file: ", fname, " not in foo.YYYYMMDD.HHMM.xml... format") + return nil + } + datestr := fname[numind:xmlind] + log.Println("datestr in filename is ", datestr) + time, errtime := time.Parse("20060102.1504", datestr) + if errtime != nil { + log.Print("time.Parse() failed on file: ", fname, " that should be in fooHHMM format with error: ", errtime) + return nil + } + fsa.tempentryfiles = append(fsa.tempentryfiles, archentryfile{path: path, sdate: time, sz: f.Size()}) + } + return nil +} + +func (fsa *fsarchive) printEntries() { + log.Printf("dumping entries") + for _, ef := range *fsa.entryfiles { + fmt.Printf("%s %s\n", ef.path, ef.sdate) + } +} + +func (fsa *fsarchive) scan(ar archive) { + //clear the temp slice + fsa.tempentryfiles = []archentryfile{} + fsa.Scanwg.Add(1) + fsa.scanning = true + filepath.Walk(fsa.rootpathstr, ar.visit) + sort.Sort(fsa.tempentryfiles) + //allow the serve goroutine to unblock in case of STOP. + fsa.Scanwg.Done() + //signal the serve goroutine on scandone channel + fsa.scanch <- struct{}{} +} + +func (fsa *fsarchive) Serve(wg *sync.WaitGroup, ar archive) (reqchan chan<- string) { + if fsa.reqchan == nil { // we have closed the channel and now called again + fsa.reqchan = make(chan string) + } + wg.Add(1) + go func() { + defer wg.Done() + for { + select { + case req := <-fsa.reqchan: + switch req { + case "SCAN": + if fsa.scanning { + log.Print("fsarchive: already scanning. ignoring command") + } else { //fire an async goroutine to scan the files and wait for SCANDONE + go fsa.scan(ar) + } + case "DUMPENTRIES": + if fsa.scanning { + log.Print("fsar: warning. 
scanning in progress") + } + fsa.printEntries() + case "STOP": + log.Print("fsar: stopping") + fsa.Scanwg.Wait() + fsa.reqchan = nil //no more stuff from this channel + return + default: + log.Print("fsarchive: unknown request: ", req) + } + case <-fsa.scanch: + //update the reference to our file slice + fsa.entryfiles = &fsa.tempentryfiles + fsa.scanning = false + //let the config know + log.Printf("setting conf arfiles from :%v to a slice of len: %v\n", fsa.conf.arfiles, len(*fsa.entryfiles)) + fsa.conf.arfiles = fsa.entryfiles + log.Print("fsarchive: scan finished") + } + } + }() + return fsa.reqchan +} diff --git a/bgp.go b/bgp.go @@ -0,0 +1 @@ +package bgp diff --git a/cmd/archive_server.go b/cmd/archive_server.go @@ -0,0 +1,51 @@ +package main + +import ( + ar "go-bgp/archive" + "log" + "os" + "sync" +) + +func main() { + if len(os.Args) != 2 { + log.Fatal("usage: ", os.Args[0], " directory ") + } + basedirstr := os.Args[1] + ribmrtar := ar.NewMRTArchive(basedirstr, "RIBS") + wg1 := &sync.WaitGroup{} + mrtreqc := ribmrtar.Serve(wg1, ribmrtar) + mrtreqc <- "SCAN" + ribmrtar.Scanwg.Wait() + api := new(ar.API) + api.AddResource(ribmrtar, "/archive/mrt/ribs") + api.Start(3000) + close(mrtreqc) + wg1.Wait() + /* + updfsar := NewXmlArchive(basedirstr, "UPDATES") + //ribfsar := NewFsArchive(basedirstr, "RIBS") + wg2 := &sync.WaitGroup{} + updreqc := updfsar.serve(wg2, updfsar) + //ribreqc := updfsar.serve(wg2) + updreqc <- "SCAN" + updfsar.scanwg.Wait() + //ribfsar.scanwg.Wait() + //time.Sleep(time.Second*2) + updreqc <- "DUMPENTRIES" + api := new(API) + api.AddResource(updfsar, "/archive/updates") + //api.AddResource(ribfsar, "/archive/ribs") + api.AddResource(updfsar.conf, "/archive/updates/conf") + //api.AddResource(ribfsar.conf, "/archive/ribs/conf") + api.Start(3000) + //reqc<-"STOP" + + close(updreqc) + //close(ribreqc) + //wait for it + //wg1.Wait() + wg2.Wait() + */ + log.Print("all fsarchives stopped. 
exiting") +} diff --git a/doc/draft-ietf-grow-mrt-11.txt b/doc/draft-ietf-grow-mrt-11.txt @@ -0,0 +1,1625 @@ + + + +Network Working Group L. Blunk +Internet-Draft M. Karir +Intended status: Standards Track Merit Network +Expires: September 9, 2010 C. Labovitz + Arbor Networks + March 8, 2010 + + + MRT routing information export format + draft-ietf-grow-mrt-11.txt + +Abstract + + This document describes the MRT format for routing information + export. This format was developed in concert with the Multi-threaded + Routing Toolkit (MRT) from whence the format takes its name. The + format can be used to export routing protocol messages, state + changes, and routing information base contents. + +Status of this Memo + + This Internet-Draft is submitted to IETF in full conformance with the + provisions of BCP 78 and BCP 79. + + Internet-Drafts are working documents of the Internet Engineering + Task Force (IETF), its areas, and its working groups. Note that + other groups may also distribute working documents as Internet- + Drafts. + + Internet-Drafts are draft documents valid for a maximum of six months + and may be updated, replaced, or obsoleted by other documents at any + time. It is inappropriate to use Internet-Drafts as reference + material or to cite them other than as "work in progress." + + The list of current Internet-Drafts can be accessed at + http://www.ietf.org/ietf/1id-abstracts.txt. + + The list of Internet-Draft Shadow Directories can be accessed at + http://www.ietf.org/shadow.html. + + This Internet-Draft will expire on September 9, 2010. + +Copyright Notice + + Copyright (c) 2010 IETF Trust and the persons identified as the + document authors. All rights reserved. + + This document is subject to BCP 78 and the IETF Trust's Legal + + + +Blunk, et al. 
Expires September 9, 2010 [Page 1] + +Internet-Draft MRT Format March 2010 + + + Provisions Relating to IETF Documents + (http://trustee.ietf.org/license-info) in effect on the date of + publication of this document. Please review these documents + carefully, as they describe your rights and restrictions with respect + to this document. Code Components extracted from this document must + include Simplified BSD License text as described in Section 4.e of + the Trust Legal Provisions and are provided without warranty as + described in the BSD License. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Blunk, et al. Expires September 9, 2010 [Page 2] + +Internet-Draft MRT Format March 2010 + + +Table of Contents + + 1. Requirements notation . . . . . . . . . . . . . . . . . . . . 4 + 2. Introduction . . . . . . . . . . . . . . . . . . . . . . . . . 5 + 3. Basic MRT Format . . . . . . . . . . . . . . . . . . . . . . . 6 + 4. MRT Informational Types . . . . . . . . . . . . . . . . . . . 8 + 4.1. START Type . . . . . . . . . . . . . . . . . . . . . . . . 8 + 4.2. I_AM_DEAD Type . . . . . . . . . . . . . . . . . . . . . . 8 + 5. MRT Routing Information Types . . . . . . . . . . . . . . . . 9 + 5.1. OSPF Type . . . . . . . . . . . . . . . . . . . . . . . . 9 + 5.2. TABLE_DUMP Type . . . . . . . . . . . . . . . . . . . . . 10 + 5.3. TABLE_DUMP_V2 Type . . . . . . . . . . . . . . . . . . . . 11 + 5.4. BGP4MP Type . . . . . . . . . . . . . . . . . . . . . . . 14 + 5.4.1. BGP4MP_STATE_CHANGE Subtype . . . . . . . . . . . . . 14 + 5.4.2. BGP4MP_MESSAGE Subtype . . . . . . . . . . . . . . . . 15 + 5.4.3. BGP4MP_MESSAGE_AS4 Subtype . . . . . . . . . . . . . . 16 + 5.4.4. BGP4MP_STATE_CHANGE_AS4 Subtype . . . . . . . . . . . 16 + 5.4.5. BGP4MP_MESSAGE_LOCAL Subtype . . . . . . . . . . . . . 17 + 5.4.6. BGP4MP_MESSAGE_AS4_LOCAL Subtype . . . . . . . . . . . 17 + 5.5. BGP4MP_ET Type . . . . . . . . . . . . . . . . . . . . . . 17 + 5.6. ISIS Type . 
. . . . . . . . . . . . . . . . . . . . . . . 18 + 5.7. ISIS_ET Type . . . . . . . . . . . . . . . . . . . . . . . 18 + 5.8. OSPFv3 Type . . . . . . . . . . . . . . . . . . . . . . . 18 + 5.9. OSPFv3_ET Type . . . . . . . . . . . . . . . . . . . . . . 19 + 6. IANA Considerations . . . . . . . . . . . . . . . . . . . . . 20 + 6.1. Type Codes . . . . . . . . . . . . . . . . . . . . . . . . 20 + 6.2. Subtype Codes . . . . . . . . . . . . . . . . . . . . . . 20 + 7. Security Considerations . . . . . . . . . . . . . . . . . . . 21 + 8. References . . . . . . . . . . . . . . . . . . . . . . . . . . 22 + 8.1. Normative References . . . . . . . . . . . . . . . . . . . 22 + 8.2. Informative References . . . . . . . . . . . . . . . . . . 22 + Appendix A. Deprecated MRT types . . . . . . . . . . . . . . . . 23 + A.1. Deprecated MRT Informational Types . . . . . . . . . . . . 23 + A.1.1. NULL Type . . . . . . . . . . . . . . . . . . . . . . 23 + A.1.2. DIE Type . . . . . . . . . . . . . . . . . . . . . . . 23 + A.1.3. PEER_DOWN Type . . . . . . . . . . . . . . . . . . . . 23 + A.2. Deprecated MRT Routing Information Types . . . . . . . . . 23 + A.2.1. BGP Type . . . . . . . . . . . . . . . . . . . . . . . 23 + A.2.2. RIP Type . . . . . . . . . . . . . . . . . . . . . . . 26 + A.2.3. IDRP Type . . . . . . . . . . . . . . . . . . . . . . 26 + A.2.4. RIPNG Type . . . . . . . . . . . . . . . . . . . . . . 26 + A.2.5. BGP4PLUS and BGP4PLUS_01 Types . . . . . . . . . . . . 27 + A.2.6. Deprecated BGP4MP Subtypes . . . . . . . . . . . . . . 27 + Authors' Addresses . . . . . . . . . . . . . . . . . . . . . . . . 29 + + + + + + + +Blunk, et al. Expires September 9, 2010 [Page 3] + +Internet-Draft MRT Format March 2010 + + +1. Requirements notation + + The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", + "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this + document are to be interpreted as described in [RFC2119]. 
+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Blunk, et al. Expires September 9, 2010 [Page 4] + +Internet-Draft MRT Format March 2010 + + +2. Introduction + + Researchers and engineers often wish to analyze network behavior by + studying routing protocol transactions and routing information base + snapshots. To this end, the MRT format was developed to encapsulate, + export, and archive this information in a standardized data + representation. The BGP routing protocol, in particular, has been + the subject of extensive study and analysis which has been + significantly aided by the availability of the MRT format. The MRT + format was initially defined in the MRT Programmer's Guide [MRT PROG + GUIDE]. + + This memo serves to document the MRT format as currently implemented + in publicly available software. The format has been extended since + it's original introduction in the MRT toolset and these extensions + are also included in this memo. Further extensions may be introduced + at a later date through additional definitions of the MRT Type field + and Subtype fields. + + A number of MRT message types have been documented in some references + but are not known to have been implemented. Further, several types + were employed in early MRT implementations, but are no longer + actively being used. These types are considered to be deprecated and + are documented in a separate appendix at the end of this document. + Some of the deprecated types may of interest to researchers examining + historical MRT archives. + + Fields which contain multi-octet numeric values are encoded in + network octet order from most significant octet to least significant + octet. Fields which contain routing message fields are encoded in + the same order as they appear in the packet contents. + + + + + + + + + + + + + + + + + + + + +Blunk, et al. Expires September 9, 2010 [Page 5] + +Internet-Draft MRT Format March 2010 + + +3. 
Basic MRT Format + + All MRT format messages have a common header which includes a + timestamp, Type, Subtype, and length field. The header is followed + by a message field. The MRT common header is illustrated below. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Timestamp | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Type | Subtype | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Message... (variable) + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Header Field Descriptions: + + + Timestamp: + + Time in seconds since 1 January 1970 00:00:00 UTC + + + Type: + + A 2-octet field that indicates the Type of information + contained in the message field. Types 0 through 4 are + informational messages pertaining to the state of an MRT + collector, while Types 5 and higher are used to convey routing + information. + + + Subtype: + + A 2-octet field that is used to further distinguish message + information within a particular message Type. + + + Length: + + A 4-octet message length field. The length field contains the + number of octets within the message. The length field does not + include the length of the MRT common header. + + + + + +Blunk, et al. Expires September 9, 2010 [Page 6] + +Internet-Draft MRT Format March 2010 + + + + Message: + + A variable length message. The contents of this field are + context dependent upon the Type and Subtype fields. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Blunk, et al. Expires September 9, 2010 [Page 7] + +Internet-Draft MRT Format March 2010 + + +4. MRT Informational Types + + The MRT format defines five Informational Type messages. 
These + messages are intended to signal the state of an MRT data collector + and do not contain routing information. These messages are OPTIONAL + and were largely intended for use when MRT messages are sent over a + network to a remote repository store. However, MRT message + repository stores have traditionally resided on the same device as + the collector and these Informational Types have seen limited + implementation. Further, transport mechanisms for MRT messages are + considered to be outside the scope of this document. + + The START and I_AM_DEAD messages MAY be used to provide a time + reference when a data collector begins and ends the collection + process. The time reference is obtained from the Timestamp field in + the MRT message header. + + The message field MAY contain an OPTIONAL message string for + diagnostic purposes. The message string encoding MUST follow the + UTF-8 transformation format. The Subtype field is unused for these + Types and SHOULD be set to 0. + + The MRT Informational Types are defined below: + + 1 START + 3 I_AM_DEAD + +4.1. START Type + + The START Type indicates a collector is about to begin generating MRT + messages. + +4.2. I_AM_DEAD Type + + An I_AM_DEAD MRT message indicates that a collector has shut down and + has stopped generating MRT messages. + + + + + + + + + + + + + + + +Blunk, et al. Expires September 9, 2010 [Page 8] + +Internet-Draft MRT Format March 2010 + + +5. MRT Routing Information Types + + The following Types are currently defined for the MRT format. Types + 11 and 12 were defined in the MRT Toolkit package. The BGP4MP Type, + number 16, was initially defined in the Zebra routing software + package. The BGP4MP_ET, ISIS, and ISIS_ET Types were initially + defined in the Sprint Labs Python Routing Toolkit (PyRT). The OSPFv3 + and OSPFv3_ET Types are newly defined types created for the OSPFv3 + routing protocol. 
+ + 11 OSPF + 12 TABLE_DUMP + 13 TABLE_DUMP_V2 + 16 BGP4MP + 17 BGP4MP_ET + 32 ISIS + 33 ISIS_ET + 48 OSPFv3 + 49 OSPFv3_ET + +5.1. OSPF Type + + This Type supports the OSPF Protocol as defined in RFC 2328 + [RFC2328]. The Subtype field may contain two possible values: + + 0 OSPF_STATE_CHANGE + 1 OSPF_LSA_UPDATE + + The format of the MRT Message field for the OSPF Type is as follows: + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Remote IP address | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Local IP address | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | OSPF Message Contents (variable) + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + + + + + + + + + + + +Blunk, et al. Expires September 9, 2010 [Page 9] + +Internet-Draft MRT Format March 2010 + + +5.2. TABLE_DUMP Type + + The TABLE_DUMP Type is used to encode the contents of a BGP Routing + Information Base (RIB). Each RIB entry is encoded in a distinct + sequential MRT record. The Subtype field is used to encode whether + the RIB entry contains IPv4 or IPv6 addresses. There are two + possible values for the Subtype as shown below. + + 1 AFI_IPv4 + 2 AFI_IPv6 + + The format of the TABLE_DUMP Type is illustrated below. 
+ + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | View # | Sequence number | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Prefix (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Prefix Length | Status | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Originated Time | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer IP address (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer AS | Attribute Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | BGP Attribute... (variable) + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + The View field is normally 0 and is intended for cases where an + implementation may have multiple RIB views (such as a route server). + In cases where multiple RIB views are present, an implementation may + use the view field to distinguish entries from each view. The + Sequence field is a simple incremental counter for each RIB entry. A + typical RIB dump will exceed the 16-bit bounds of this counter and + implementations should simply wrap back to zero and continue + incrementing the counter in such cases. + + The Prefix field contains the IP address of a particular RIB entry. + The size of this field is dependent on the value of the Subtype for + this message. For AFI_IPv4, this field is 4 octets, for AFI_IPv6, it + is 16 octets in length. The Prefix Length field indicates the length + in bits of the prefix mask for the preceding Prefix field. + + The Status octet is not used in the TABLE_DUMP Type and SHOULD be set + to 1. + + + +Blunk, et al. Expires September 9, 2010 [Page 10] + +Internet-Draft MRT Format March 2010 + + + The Originated Time contains the 4-octet time at which this prefix + was heard.
The value represents the time in seconds since 1 January + 1970 00:00:00 UTC. + + The Peer IP field is the IP address of the peer which provided the + update for this RIB entry. As with the Prefix field, the size of + this field is dependent on the Subtype. AFI_IPv4 indicates a 4 octet + field and an IPv4 address, while a Subtype of AFI_IPv6 requires a 16 + octet field and an IPv6 address. The Peer AS field contains the AS + number of the peer. + + Attribute length is the length of Attribute field and is 2-octets. + The Attribute field contains the attribute information for the RIB + entry. + +5.3. TABLE_DUMP_V2 Type + + The TABLE_DUMP_V2 Type updates the TABLE_DUMP Type to include 4-Byte + ASN support and full support for BGP Multiprotocol extensions. It + also improves upon the space efficiency of the TABLE_DUMP Type by + employing an index table for peers and permitting a single MRT record + per NLRI entry. The following subtypes are used with the + TABLE_DUMP_V2 Type. + + 1 PEER_INDEX_TABLE + 2 RIB_IPV4_UNICAST + 3 RIB_IPV4_MULTICAST + 4 RIB_IPV6_UNICAST + 5 RIB_IPV6_MULTICAST + 6 RIB_GENERIC + + An initial PEER_INDEX_TABLE MRT record provides the BGP ID of the + collector, an optional view name, and a list of indexed peers. + Following the PEER_INDEX_TABLE MRT record, a series of MRT records + are used to encode RIB table entries. This series of MRT records use + subtypes 2-6 and are separate from the PEER_INDEX_TABLE MRT record + itself and include full MRT record headers. The header of the + PEER_INDEX_TABLE Subtype is shown below. The View Name is optional + and, if not present, the View Name Length MUST be set to 0. The View + Name encoding MUST follow the UTF-8 transformation format. + + + + + + + + + + + +Blunk, et al. 
Expires September 9, 2010 [Page 11] + +Internet-Draft MRT Format March 2010 + + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Collector BGP ID | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | View Name Length | View Name (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer Count | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + The format of the peer entries is shown below. The PEER_INDEX_TABLE + record contains Peer Count peer entries. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer Type | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer BGP ID | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer IP address (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer AS (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + The Peer Type, Peer BGP ID, Peer IP, and Peer AS fields are repeated + as indicated by the Peer Count field. The position of the Peer in + the PEER_INDEX_TABLE is used as an index in the subsequent + TABLE_DUMP_V2 MRT records. The index number begins with 0. + + The Peer Type field is a bit field which encodes the type of the AS + and IP address as follows: + + Bit 0 - unset for IPv4 Peer IP address, set for IPv6 + Bit 1 - unset when Peer AS is 16 bits, set when it's 32 bits + + The records which follow the PEER_INDEX_TABLE record constitute the + RIB entries and include a header which specifies a sequence number, + NLRI, and a count of the number of RIB entries which follow. + + The format for the RIB_IPV4_UNICAST, RIB_IPV4_MULTICAST, + RIB_IPV6_UNICAST, and RIB_IPV6_MULTICAST headers are shown below. 
+ The Prefix Length and Prefix fields are encoded in the same manner as + the BGP NLRI encoding for IPV4 and IPV6 prefixes. Namely, the Prefix + field contains address prefixes followed by enough trailing bits to + make the end of the field fall on an octet boundary. Note that the + value of trailing bits is irrelevant. + + + + +Blunk, et al. Expires September 9, 2010 [Page 12] + +Internet-Draft MRT Format March 2010 + + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Sequence number | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Prefix Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Prefix (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Entry Count | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + The RIB_GENERIC header is shown below. It includes Address Family + Identifier (AFI), Subsequent AFI and a single NLRI entry. The NLRI + information is specific to the AFI and SAFI values. An + implementation which does not recognize particular AFI and SAFI + values SHOULD discard the remainder of the MRT record. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Sequence number | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Address Family Identifier |Subsequent AFI | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Network Layer Reachability Information (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Entry Count | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + The RIB entry headers are followed by a series of RIB entries which + are repeated Entry Count times. These entries share a common format + as shown below. 
They include a Peer Index from the PEER_INDEX_TABLE + MRT record, an originated time for the RIB entry, and the BGP path + attribute length and attributes encoded as provided in a BGP Update + message. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer Index | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Originated Time | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Attribute Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | BGP Attributes... (variable) + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + + +Blunk, et al. Expires September 9, 2010 [Page 13] + +Internet-Draft MRT Format March 2010 + + + There is one exception to the encoding of BGP attributes for the BGP + MP_REACH_NLRI attribute (BGP Type Code 14) [RFC 4760]. Since the + AFI, SAFI, and NLRI information is already encoded in the + MULTIPROTOCOL header, only the Next Hop Address Length and Next Hop + Address fields are included. The Reserved field is omitted. The + attribute length is also adjusted to reflect only the length of the + Next Hop Address Length and Next Hop Address fields. + +5.4. BGP4MP Type + + This Type was initially defined in the Zebra software package for the + BGP protocol with multiprotocol extension support as defined by RFC + 4760 [RFC4760]. It supersedes the BGP, BGP4PLUS, BGP4PLUS_01 Types. + The BGP4MP Type has six Subtypes which are defined as follows: + + 0 BGP4MP_STATE_CHANGE + 1 BGP4MP_MESSAGE + 4 BGP4MP_MESSAGE_AS4 + 5 BGP4MP_STATE_CHANGE_AS4 + 6 BGP4MP_MESSAGE_LOCAL + 7 BGP4MP_MESSAGE_AS4_LOCAL + +5.4.1. BGP4MP_STATE_CHANGE Subtype + + This record is used to encode state changes in the BGP finite state + machine. The BGP FSM states are encoded in the Old State and New + State fields to indicate the previous and current state. 
In some + cases, the Peer AS number may be undefined. In such cases, the value + of this field may be set to zero. The format is illustrated below: + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer AS number | Local AS number | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Interface Index | Address Family | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer IP address (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Local IP address (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Old State | New State | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + + + + + + + +Blunk, et al. Expires September 9, 2010 [Page 14] + +Internet-Draft MRT Format March 2010 + + + The FSM states are defined in RFC 4271 [RFC4271], Section 8.2.2. + Both the old state value and the new state value are encoded as + 2-octet numbers. The state values are defined numerically as + follows: + + 1 Idle + 2 Connect + 3 Active + 4 OpenSent + 5 OpenConfirm + 6 Established + + The BGP4MP_STATE_CHANGE message also includes interface index and + Address Family fields. The interface index provides the interface + number of the peering session. The index value is OPTIONAL and MAY + be zero if unknown or unsupported. The Address Family indicates what + types of addresses are in the address fields. At present, the + following AFI Types are supported: + + 1 AFI_IPv4 + 2 AFI_IPv6 + +5.4.2. BGP4MP_MESSAGE Subtype + + This Subtype is used to encode BGP Messages. It can be used to + encode any Type of BGP message. The entire BGP message is + encapsulated in the BGP Message field, including the 16-octet marker, + the 2-octet length, and the 1-octet type fields. Note that the + BGP4MP_MESSAGE Subtype does not support 4-Byte AS numbers.
Further, + the AS_PATH contained in these messages MUST only consist of 2-Byte + AS numbers. The BGP4MP_MESSAGE_AS4 Subtype updates the + BGP4MP_MESSAGE Subtype in order to support 4-Byte AS numbers. The + BGP4MP_MESSAGE fields are shown below: + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer AS number | Local AS number | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Interface Index | Address Family | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer IP address (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Local IP address (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | BGP Message... (variable) + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + + + +Blunk, et al. Expires September 9, 2010 [Page 15] + +Internet-Draft MRT Format March 2010 + + + The interface index provides the interface number of the peering + session. The index value is OPTIONAL and MAY be zero if unknown or + unsupported. The Address Family indicates what types of addresses + are in the subsequent address fields. At present, the following + AFI Types are supported: + + 1 AFI_IPv4 + 2 AFI_IPv6 + + Note that the Address Family value only applies to the IP addresses + contained in the MRT header. The BGP4MP_MESSAGE Subtype is otherwise + transparent to the contents of the actual message which may contain + any valid AFI/SAFI values. Only one BGP message may be encoded in + the BGP4MP_MESSAGE Subtype. + +5.4.3. BGP4MP_MESSAGE_AS4 Subtype + + This Subtype updates the BGP4MP_MESSAGE Subtype to support 4-Byte + Autonomous System numbers. The BGP4MP_MESSAGE_AS4 Subtype is + otherwise identical to the BGP4MP_MESSAGE Subtype. The AS_PATH in + these messages MUST only consist of 4-Byte AS numbers.
The + BGP4MP_MESSAGE_AS4 fields are shown below: + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer AS number | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Local AS number | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Interface Index | Address Family | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer IP address (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Local IP address (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | BGP Message... (variable) + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +5.4.4. BGP4MP_STATE_CHANGE_AS4 Subtype + + This Subtype updates the BGP4MP_STATE_CHANGE Subtype to support + 4-Byte Autonomous System numbers. As with the BGP4MP_STATE_CHANGE + Subtype, the BGP FSM states are encoded in the Old State and New + State fields to indicate the previous and current state. Aside from + the extension of the peer and local AS fields to 4-Bytes, this + subtype is otherwise identical to the BGP4MP_STATE_CHANGE Subtype. + The BGP4MP_STATE_CHANGE_AS4 fields are shown below: + + + +Blunk, et al. 
Expires September 9, 2010 [Page 16] + +Internet-Draft MRT Format March 2010 + + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer AS number | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Local AS number | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Interface Index | Address Family | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer IP address (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Local IP address (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Old State | New State | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +5.4.5. BGP4MP_MESSAGE_LOCAL Subtype + + Implementations of MRT have largely focused on collecting remotely + generated BGP messages in a passive route collector role. However, + for active BGP implementations, it can be useful to archive locally + generated BGP messages in addition to remote messages. This subtype + is added to indicate a locally generated BGP message. The fields + remain identical to the BGP4MP_MESSAGE type including the Peer and + Local IP and AS fields. The Local fields continue to refer to the + local IP and AS number of the collector which generated the message + and the Peer IP and AS fields refer to the recipient of the + generated BGP messages. + +5.4.6. BGP4MP_MESSAGE_AS4_LOCAL Subtype + + As with the BGP4MP_MESSAGE_LOCAL type, this type indicates locally + generated messages. The fields are identical to the + BGP4MP_MESSAGE_AS4 message type. + +5.5. BGP4MP_ET Type + + This Type was initially defined in the Sprint Labs Python Routing + Toolkit (PyRT). It extends the MRT common header field to include a + 32BIT microsecond timestamp field. The type and subtype field + definitions remain as defined for the BGP4MP Type.
The 32BIT + microsecond timestamp immediately follows the length field in the MRT + common header and precedes all other fields in the message. The + 32BIT microsecond field is included in the computation of the length + field value. The MRT common header modification is illustrated + below. + + + + + +Blunk, et al. Expires September 9, 2010 [Page 17] + +Internet-Draft MRT Format March 2010 + + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Timestamp | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Type | Subtype | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | microsecond timestamp | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Message... (variable) + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +5.6. ISIS Type + + This Type was initially defined in the Sprint Labs Python Routing + Toolkit (PyRT) and supports the IS-IS routing protocol as defined in + RFC 1195 [RFC1195]. + There is no Type specific header for the ISIS Type. The Subtype code + for this Type is undefined. The ISIS PDU directly follows the MRT + common header fields. + +5.7. ISIS_ET Type + + The ISIS_ET Type extends the ISIS Type to support microsecond + timestamps. As with the BGP4MP_ET Type, a 32BIT microsecond + timestamp field is appended to the MRT common header after the length + field. The ISIS_ET Type is otherwise identical to the ISIS Type. + +5.8. OSPFv3 Type + + The OSPFv3 Type extends the original OSPF Type to support IPv6 + addresses for the OSPFv3 protocol as defined in RFC 5340 [RFC5340].
+ The format of the MRT Message field for the OSPFv3 Type is as + follows: + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Address Family | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Remote IP address (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Local IP address (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | OSPF Message Contents (variable) + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + + + +Blunk, et al. Expires September 9, 2010 [Page 18] + +Internet-Draft MRT Format March 2010 + + +5.9. OSPFv3_ET Type + + The OSPFv3_ET Type extends the OSPFv3 Type to support microsecond + timestamps. As with the BGP4MP_ET Type, a 32BIT microsecond + timestamp field is appended to the MRT common header after the length + field and its length is included in the calculation of the length + field value. The OSPFv3_ET Type is otherwise identical to the OSPFv3 + Type. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Blunk, et al. Expires September 9, 2010 [Page 19] + +Internet-Draft MRT Format March 2010 + + +6. IANA Considerations + + This section provides guidance to the Internet Assigned Numbers + Authority (IANA) regarding registration of values related to the MRT + specification, in accordance with BCP 26, RFC 5226 [RFC5226]. + + There are two name spaces in MRT that require registration: Type + Codes and Subtype Codes. + + MRT is not intended as a general-purpose specification for protocol + information export, and allocations should not be made for purposes + unrelated to routing protocol information export. + + The following policies are used here with the meanings defined in BCP + 26: "Specification Required", "IETF Consensus", "Experimental Use", + "First Come First Served". + +6.1. 
Type Codes + + Type Codes have a range from 0 to 65535, of which 1-64 have been + allocated. New Type Codes MUST be allocated starting at 65. Type + Codes 65 - 511 are to be assigned by IETF Review. Type Codes 512 - + 2047 are assigned based on Specification Required. Type Codes 2048 - + 64511 are available on a First Come First Served policy. Type Codes + 64512 - 65534 are available for Experimental Use. The Type Code + Values of 0 and 65535 are reserved. + +6.2. Subtype Codes + + Subtype Codes have a range from 0 to 65535. Subtype definitions are + specific to a particular Type Code definition. New Subtype Code + definitions must reference an existing Type Code to which the Subtype + belongs. Subtype assignments to Type Codes 0 - 511 are to be + assigned by IETF Review. Subtype assignments for the remaining Type + Codes follow the assignment rules for the Type Codes to which they + belong. + + + + + + + + + + + + + + + +Blunk, et al. Expires September 9, 2010 [Page 20] + +Internet-Draft MRT Format March 2010 + + +7. Security Considerations + + The MRT Format utilizes a structure which can store routing protocol + information data. The fields defined in the MRT specification are of + a descriptive nature and provide information that is useful to + facilitate the analysis of routing data. As such, the fields + currently defined in the MRT specification do not in themselves + create additional security risks, since the fields are not used to + induce any particular behavior by the recipient application. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Blunk, et al. Expires September 9, 2010 [Page 21] + +Internet-Draft MRT Format March 2010 + + +8. References + +8.1. Normative References + + [RFC1058] Hedrick, C., "Routing Information Protocol", RFC 1058, + June 1988. + + [RFC1195] Callon, R., "Use of OSI IS-IS for routing in TCP/IP and + dual environments", RFC 1195, December 1990. + + [RFC2080] Malkin, G. and R.
Minnear, "RIPng for IPv6", RFC 2080, + January 1997. + + [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate + Requirement Levels", BCP 14, RFC 2119, March 1997. + + [RFC2328] Moy, J., "OSPF Version 2", STD 54, RFC 2328, April 1998. + + [RFC4271] Rekhter, Y., Li, T., and S. Hares, "A Border Gateway + Protocol 4 (BGP-4)", RFC 4271, January 2006. + + [RFC4760] Bates, T., Chandra, R., Katz, D., and Y. Rekhter, + "Multiprotocol Extensions for BGP-4", RFC 4760, + January 2007. + + [RFC5226] Narten, T. and H. Alvestrand, "Guidelines for Writing an + IANA Considerations Section in RFCs", BCP 26, RFC 5226, + May 2008. + + [RFC5340] Coltun, R., Ferguson, D., Moy, J., and A. Lindem, "OSPF + for IPv6", RFC 5340, July 2008. + +8.2. Informative References + + [MRT PROG GUIDE] + Labovitz, C., "MRT Programmer's Guide", November 1999, + <http://www.merit.edu/networkresearch/mrtprogrammer.pdf>. + + + + + + + + + + + + + + +Blunk, et al. Expires September 9, 2010 [Page 22] + +Internet-Draft MRT Format March 2010 + + +Appendix A. Deprecated MRT types + + This Appendix lists deprecated MRT types. These types are documented + for informational purposes only. While documented in some + references, they are not known to have been generally implemented. + +A.1. Deprecated MRT Informational Types + + The deprecated MRT Informational Types are defined below: + + 0 NULL + 2 DIE + 4 PEER_DOWN + +A.1.1. NULL Type + + The NULL Type message causes no operation. + +A.1.2. DIE Type + + The DIE Type signals a remote MRT repository it should stop accepting + messages. + +A.1.3. PEER_DOWN Type + + The PEER_DOWN message was intended to indicate that a collector had + lost association with a BGP peer. However, the MRT format provides + BGP state change message types which duplicate this functionality. + +A.2. Deprecated MRT Routing Information Types + + 5 BGP + 6 RIP + 7 IDRP + 8 RIPNG + 9 BGP4PLUS + 10 BGP4PLUS_01 + +A.2.1. 
BGP Type + + The BGP Type indicates the Message field contains BGP routing + information. The BGP routing protocol is defined in RFC 4271 + [RFC4271]. The information in the message is dependent on the + Subtype value. The BGP Type and all associated Subtypes below are + considered to be deprecated by the BGP4MP Type. + + The following BGP Subtypes are defined for the MRT BGP Type. As with + the BGP Type itself, they are all considered to be deprecated. + + + +Blunk, et al. Expires September 9, 2010 [Page 23] + +Internet-Draft MRT Format March 2010 + + + 0 BGP_NULL + 1 BGP_UPDATE + 2 BGP_PREF_UPDATE + 3 BGP_STATE_CHANGE + 4 BGP_SYNC + 5 BGP_OPEN + 6 BGP_NOTIFY + 7 BGP_KEEPALIVE + +A.2.1.1. BGP_NULL Subtype + + The BGP_NULL Subtype is a reserved Subtype. + +A.2.1.2. BGP_UPDATE Subtype + + The BGP_UPDATE Subtype is used to encode BGP UPDATE messages. The + format of the MRT Message field for this Subtype is as follows: + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer AS number | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer IP address | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Local AS number | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Local IP address | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | BGP UPDATE Contents (variable) + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + The BGP UPDATE Contents include the entire BGP UPDATE message which + follows the BGP Message Header. The BGP Message Header itself is not + included. The Peer AS number and IP address fields contain the AS + number and IP address of the remote system which are generating the + BGP UPDATE messages. The Local AS number and IP address fields + contain the AS number and IP address of the local collector system + which is archiving the messages. + +A.2.1.3. 
BGP_PREF_UPDATE Subtype + + The BGP_PREF_UPDATE Subtype is not defined. + +A.2.1.4. BGP_STATE_CHANGE Subtype + + The BGP_STATE_CHANGE Subtype is used to record changes in the BGP + finite state machine. These FSM states are defined in RFC 4271 + + + +Blunk, et al. Expires September 9, 2010 [Page 24] + +Internet-Draft MRT Format March 2010 + + + [RFC4271], Section 8.2.2. Both the old state value and the new state + value are encoded as 2-octet numbers. The state values are defined + numerically as follows: + + 1 Idle + 2 Connect + 3 Active + 4 OpenSent + 5 OpenConfirm + 6 Established + + The format of the MRT Message field is as follows: + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer AS number | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer IP address | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Old State | New State | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +A.2.1.5. BGP_SYNC Subtype + + The BGP_SYNC Subtype was intended to convey a system file name where + BGP Table Dump messages should be recorded. The View # was to + correspond to the View # provided in the TABLE_DUMP Type messages. + There are no known implementations of this subtype and it SHOULD be + ignored. The following format applies to this Subtype: + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | View # | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | File Name... (variable) + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + The File Name is terminated with a NULL (0) character. + +A.2.1.6. BGP_OPEN Subtype + + The BGP_OPEN Subtype is used to encode BGP OPEN messages. The format + of the MRT Message field for this Subtype is the same as the + BGP_UPDATE, however, the last field contains the contents of the BGP + OPEN message. 
+ + + + +Blunk, et al. Expires September 9, 2010 [Page 25] + +Internet-Draft MRT Format March 2010 + + +A.2.1.7. BGP_NOTIFY Subtype + + The BGP_NOTIFY Subtype is used to encode BGP NOTIFICATION messages. + The format of the MRT Message field for this Subtype is the same as + the BGP_UPDATE, however, the last field contains the contents of the + BGP NOTIFICATION message. + +A.2.1.8. BGP_KEEPALIVE Subtype + + The BGP_KEEPALIVE Subtype is used to encode BGP KEEPALIVE messages. + The format of the MRT Message field for this Subtype is the same as + the BGP_UPDATE, however, the last field contains no information. + +A.2.2. RIP Type + + The RIP Type is used to export RIP protocol packets as defined in RFC + 1058 [RFC1058]. The Subtype field is currently reserved for this + Type and SHOULD be set to 0. + + The format of the MRT Message field for the RIP Type is as follows: + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer IP address | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Local IP address | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | RIP Message Contents (variable) + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +A.2.3. IDRP Type + + The IDRP Type is used to export Inter-Domain-Routing Protocol (IDRP) + protocol information as defined in the ISO/IEC 10747 standard. The + Subtype field is unused. This Type is deprecated due to lack of + deployment of IDRP. + +A.2.4. RIPNG Type + + The RIPNG Type is used to export RIPNG protocol packets as defined in + RFC 2080 [RFC2080]. The RIPNG protocol updates the RIP protocol to + support IPv6. The Subtype field is currently reserved for this Type + and SHOULD be set to 0. + + The format of the MRT Message field for the RIPNG Type is as follows: + + + + + +Blunk, et al. 
Expires September 9, 2010 [Page 26] + +Internet-Draft MRT Format March 2010 + + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + ~ Peer IPv6 address ~ + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + ~ Local IPv6 address ~ + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | RIPNG Message Contents (variable) + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +A.2.5. BGP4PLUS and BGP4PLUS_01 Types + + The BGP4PLUS and BGP4PLUS_01 Types were defined to support IPv6 BGP + routing information. The BGP4PLUS Type was specified based on the + initial Internet Draft for Multiprotocol Extensions to BGP-4. The + BGP4PLUS_01 Type was specified to correspond to the -01 revision of + this Internet Draft. The two Types share the same definitions in + terms of their MRT format specifications. + + The Subtype field definitions are shared with the BGP Type, however, + the address fields in the BGP_UPDATE, BGP_OPEN, BGP_NOTIFY, + BGP_KEEPALIVE, and BGP_STATE_CHANGE Subtype messages are extended to + 16 octets for IPv6 addresses. As with the BGP Type, the BGP4PLUS and + BGP4PLUS_01 Types are deprecated as they are superseded by the BGP4MP + Type. + +A.2.6. Deprecated BGP4MP Subtypes + + The following two subtypes of the BGP4MP Type are considered to be + deprecated. + + 2 BGP4MP_ENTRY + 3 BGP4MP_SNAPSHOT + +A.2.6.1. BGP4MP_ENTRY Subtype + + This Subtype is similar to the TABLE_DUMP Type and is used to record + RIB table entries. It extends the TABLE_DUMP Type to include true + multiprotocol support. However, this Type does not support 4-Byte AS + numbers and has not been widely implemented. This Type is deprecated + in favor of the TABLE_DUMP_V2 which includes 4-Byte AS number support + and a more compact format. + + + + + +Blunk, et al.
Expires September 9, 2010 [Page 27] + +Internet-Draft MRT Format March 2010 + + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer AS number | Local AS number | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Interface Index | Address Family | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Peer IP address (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Local IP address (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | View # | Status | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Time last change | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Address Family | SAFI | Next-Hop-Len | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Next Hop Address (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Prefix Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Address Prefix (variable) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Attribute Length | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | BGP Attribute... (variable) + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + +A.2.6.2. BGP4MP_SNAPSHOT Subtype + + This Subtype was intended to convey a system file name where + BGP4MP_ENTRY messages should be recorded. It is similar to the + BGP_SYNC message Subtype and is deprecated. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | View # | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | File Name... (variable) + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + + + + + + + + + +Blunk, et al. 
Expires September 9, 2010 [Page 28] + +Internet-Draft MRT Format March 2010 + + +Authors' Addresses + + Larry Blunk + Merit Network + + Email: ljb@merit.edu + + + Manish Karir + Merit Network + + Email: mkarir@merit.edu + + + Craig Labovitz + Arbor Networks + + Email: labovit@arbor.net + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Blunk, et al. Expires September 9, 2010 [Page 29] + + diff --git a/doc/rfc1771.txt b/doc/rfc1771.txt @@ -0,0 +1,3195 @@ + + + + + + +Network Working Group Y. Rekhter +Request for Comments: 1771 T.J. Watson Research Center, IBM Corp. +Obsoletes: 1654 T. Li +Category: Standards Track cisco Systems + Editors + March 1995 + + + A Border Gateway Protocol 4 (BGP-4) + +Status of this Memo + + This document specifies an Internet standards track protocol for the + Internet community, and requests discussion and suggestions for + improvements. Please refer to the current edition of the "Internet + Official Protocol Standards" (STD 1) for the standardization state + and status of this protocol. Distribution of this memo is unlimited. + +Abstract + + This document, together with its companion document, "Application of + the Border Gateway Protocol in the Internet", define an inter- + autonomous system routing protocol for the Internet. + +1. Acknowledgements + + This document was originally published as RFC 1267 in October 1991, + jointly authored by Kirk Lougheed (cisco Systems) and Yakov Rekhter + (IBM). + + We would like to express our thanks to Guy Almes (ANS), Len Bosack + (cisco Systems), and Jeffrey C. Honig (Cornell University) for their + contributions to the earlier version of this document. + + We like to explicitly thank Bob Braden (ISI) for the review of the + earlier version of this document as well as his constructive and + valuable comments. 
+ + We would also like to thank Bob Hinden, Director for Routing of the + Internet Engineering Steering Group, and the team of reviewers he + assembled to review the previous version (BGP-2) of this document. + This team, consisting of Deborah Estrin, Milo Medin, John Moy, Radia + Perlman, Martha Steenstrup, Mike St. Johns, and Paul Tsuchiya, acted + with a strong combination of toughness, professionalism, and + courtesy. + + + + + + +Rekhter & Li [Page 1] + +RFC 1771 BGP-4 March 1995 + + + This updated version of the document is the product of the IETF IDR + Working Group with Yakov Rekhter and Tony Li as editors. Certain + sections of the document borrowed heavily from IDRP [7], which is the + OSI counterpart of BGP. For this credit should be given to the ANSI + X3S3.3 group chaired by Lyman Chapin (BBN) and to Charles Kunzinger + (IBM Corp.) who was the IDRP editor within that group. We would also + like to thank Mike Craren (Proteon, Inc.), Dimitry Haskin (Bay + Networks, Inc.), John Krawczyk (Bay Networks, Inc.), and Paul Traina + (cisco Systems) for their insightful comments. + + We would like to specially acknowledge numerous contributions by + Dennis Ferguson (MCI). + + The work of Yakov Rekhter was supported in part by the National + Science Foundation under Grant Number NCR-9219216. + +2. Introduction + + The Border Gateway Protocol (BGP) is an inter-Autonomous System + routing protocol. It is built on experience gained with EGP as + defined in RFC 904 [1] and EGP usage in the NSFNET Backbone as + described in RFC 1092 [2] and RFC 1093 [3]. + + The primary function of a BGP speaking system is to exchange network + reachability information with other BGP systems. This network + reachability information includes information on the list of + Autonomous Systems (ASs) that reachability information traverses. 
+ This information is sufficient to construct a graph of AS + connectivity from which routing loops may be pruned and some policy + decisions at the AS level may be enforced. + + BGP-4 provides a new set of mechanisms for supporting classless + interdomain routing. These mechanisms include support for + advertising an IP prefix and eliminates the concept of network + "class" within BGP. BGP-4 also introduces mechanisms which allow + aggregation of routes, including aggregation of AS paths. These + changes provide support for the proposed supernetting scheme [8, 9]. + + To characterize the set of policy decisions that can be enforced + using BGP, one must focus on the rule that a BGP speaker advertise to + its peers (other BGP speakers which it communicates with) in + neighboring ASs only those routes that it itself uses. This rule + reflects the "hop-by-hop" routing paradigm generally used throughout + the current Internet. Note that some policies cannot be supported by + the "hop-by-hop" routing paradigm and thus require techniques such as + source routing to enforce. For example, BGP does not enable one AS + to send traffic to a neighboring AS intending that the traffic take a + different route from that taken by traffic originating in the + + + +Rekhter & Li [Page 2] + +RFC 1771 BGP-4 March 1995 + + + neighboring AS. On the other hand, BGP can support any policy + conforming to the "hop-by-hop" routing paradigm. Since the current + Internet uses only the "hop-by-hop" routing paradigm and since BGP + can support any policy that conforms to that paradigm, BGP is highly + applicable as an inter-AS routing protocol for the current Internet. + + A more complete discussion of what policies can and cannot be + enforced with BGP is outside the scope of this document (but refer to + the companion document discussing BGP usage [5]). + + BGP runs over a reliable transport protocol. 
This eliminates the + need to implement explicit update fragmentation, retransmission, + acknowledgement, and sequencing. Any authentication scheme used by + the transport protocol may be used in addition to BGP's own + authentication mechanisms. The error notification mechanism used in + BGP assumes that the transport protocol supports a "graceful" close, + i.e., that all outstanding data will be delivered before the + connection is closed. + + BGP uses TCP [4] as its transport protocol. TCP meets BGP's + transport requirements and is present in virtually all commercial + routers and hosts. In the following descriptions the phrase + "transport protocol connection" can be understood to refer to a TCP + connection. BGP uses TCP port 179 for establishing its connections. + + This document uses the term `Autonomous System' (AS) throughout. The + classic definition of an Autonomous System is a set of routers under + a single technical administration, using an interior gateway protocol + and common metrics to route packets within the AS, and using an + exterior gateway protocol to route packets to other ASs. Since this + classic definition was developed, it has become common for a single + AS to use several interior gateway protocols and sometimes several + sets of metrics within an AS. The use of the term Autonomous System + here stresses the fact that, even when multiple IGPs and metrics are + used, the administration of an AS appears to other ASs to have a + single coherent interior routing plan and presents a consistent + picture of what destinations are reachable through it. + + The planned use of BGP in the Internet environment, including such + issues as topology, the interaction between BGP and IGPs, and the + enforcement of routing policy rules is presented in a companion + document [5]. This document is the first of a series of documents + planned to explore various aspects of BGP application. Please send + comments to the BGP mailing list (bgp@ans.net). 
+ + + + + + + +Rekhter & Li [Page 3] + +RFC 1771 BGP-4 March 1995 + + +3. Summary of Operation + + Two systems form a transport protocol connection between one another. + They exchange messages to open and confirm the connection parameters. + The initial data flow is the entire BGP routing table. Incremental + updates are sent as the routing tables change. BGP does not require + periodic refresh of the entire BGP routing table. Therefore, a BGP + speaker must retain the current version of the entire BGP routing + tables of all of its peers for the duration of the connection. + KeepAlive messages are sent periodically to ensure the liveness of + the connection. Notification messages are sent in response to errors + or special conditions. If a connection encounters an error + condition, a notification message is sent and the connection is + closed. + + The hosts executing the Border Gateway Protocol need not be routers. + A non-routing host could exchange routing information with routers + via EGP or even an interior routing protocol. That non-routing host + could then use BGP to exchange routing information with a border + router in another Autonomous System. The implications and + applications of this architecture are for further study. + + If a particular AS has multiple BGP speakers and is providing transit + service for other ASs, then care must be taken to ensure a consistent + view of routing within the AS. A consistent view of the interior + routes of the AS is provided by the interior routing protocol. A + consistent view of the routes exterior to the AS can be provided by + having all BGP speakers within the AS maintain direct BGP connections + with each other. Using a common set of policies, the BGP speakers + arrive at an agreement as to which border routers will serve as + exit/entry points for particular destinations outside the AS. This + information is communicated to the AS's internal routers, possibly + via the interior routing protocol. 
Care must be taken to ensure that + the interior routers have all been updated with transit information + before the BGP speakers announce to other ASs that transit service is + being provided. + + Connections between BGP speakers of different ASs are referred to as + "external" links. BGP connections between BGP speakers within the + same AS are referred to as "internal" links. Similarly, a peer in a + different AS is referred to as an external peer, while a peer in the + same AS may be described as an internal peer. + + + + + + + + + +Rekhter & Li [Page 4] + +RFC 1771 BGP-4 March 1995 + + +3.1 Routes: Advertisement and Storage + + For purposes of this protocol a route is defined as a unit of + information that pairs a destination with the attributes of a path to + that destination: + + - Routes are advertised between a pair of BGP speakers in UPDATE + messages: the destination is the systems whose IP addresses are + reported in the Network Layer Reachability Information (NLRI) + field, and the path is the information reported in the path + attributes fields of the same UPDATE message. + + - Routes are stored in the Routing Information Bases (RIBs): + namely, the Adj-RIBs-In, the Loc-RIB, and the Adj-RIBs-Out. Routes + that will be advertised to other BGP speakers must be present in + the Adj-RIB-Out; routes that will be used by the local BGP speaker + must be present in the Loc-RIB, and the next hop for each of these + routes must be present in the local BGP speaker's forwarding + information base; and routes that are received from other BGP + speakers are present in the Adj-RIBs-In. + + If a BGP speaker chooses to advertise the route, it may add to or + modify the path attributes of the route before advertising it to a + peer. + + BGP provides mechanisms by which a BGP speaker can inform its peer + that a previously advertised route is no longer available for use.
+ There are three methods by which a given BGP speaker can indicate + that a route has been withdrawn from service: + + a) the IP prefix that expresses destinations for a previously + advertised route can be advertised in the WITHDRAWN ROUTES field + in the UPDATE message, thus marking the associated route as being + no longer available for use + + b) a replacement route with the same Network Layer Reachability + Information can be advertised, or + + c) the BGP speaker - BGP speaker connection can be closed, which + implicitly removes from service all routes which the pair of + speakers had advertised to each other. + + + + + + + + + + +Rekhter & Li [Page 5] + +RFC 1771 BGP-4 March 1995 + + +3.2 Routing Information Bases + + The Routing Information Base (RIB) within a BGP speaker consists of + three distinct parts: + + a) Adj-RIBs-In: The Adj-RIBs-In store routing information that has + been learned from inbound UPDATE messages. Their contents + represent routes that are available as an input to the Decision + Process. + + b) Loc-RIB: The Loc-RIB contains the local routing information + that the BGP speaker has selected by applying its local policies + to the routing information contained in its Adj-RIBs-In. + + c) Adj-RIBs-Out: The Adj-RIBs-Out store the information that the + local BGP speaker has selected for advertisement to its peers. The + routing information stored in the Adj-RIBs-Out will be carried in + the local BGP speaker's UPDATE messages and advertised to its + peers. + + In summary, the Adj-RIBs-In contain unprocessed routing information + that has been advertised to the local BGP speaker by its peers; the + Loc-RIB contains the routes that have been selected by the local BGP + speaker's Decision Process; and the Adj-RIBs-Out organize the routes + for advertisement to specific peers by means of the local speaker's + UPDATE messages. 
+ + Although the conceptual model distinguishes between Adj-RIBs-In, + Loc-RIB, and Adj-RIBs-Out, this neither implies nor requires that an + implementation must maintain three separate copies of the routing + information. The choice of implementation (for example, 3 copies of + the information vs 1 copy with pointers) is not constrained by the + protocol. + +4. Message Formats + + This section describes message formats used by BGP. + + Messages are sent over a reliable transport protocol connection. A + message is processed only after it is entirely received. The maximum + message size is 4096 octets. All implementations are required to + support this maximum message size. The smallest message that may be + sent consists of a BGP header without a data portion, or 19 octets. + + + + + + + + +Rekhter & Li [Page 6] + +RFC 1771 BGP-4 March 1995 + + +4.1 Message Header Format + + Each message has a fixed-size header. There may or may not be a data + portion following the header, depending on the message type. The + layout of these fields is shown below: + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + + + + | | + + + + | Marker | + + + + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Length | Type | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Marker: + + This 16-octet field contains a value that the receiver of the + message can predict. If the Type of the message is OPEN, or if + the OPEN message carries no Authentication Information (as an + Optional Parameter), then the Marker must be all ones. + Otherwise, the value of the marker can be predicted by a + computation specified as part of the authentication mechanism + (which is specified as part of the Authentication Information) + used. The Marker can be used to detect loss of synchronization + between a pair of BGP peers, and to authenticate incoming BGP + messages.
+ + Length: + + This 2-octet unsigned integer indicates the total length of the + message, including the header, in octets. Thus, e.g., it + allows one to locate in the transport-level stream the (Marker + field of the) next message. The value of the Length field must + always be at least 19 and no greater than 4096, and may be + further constrained, depending on the message type. No + "padding" of extra data after the message is allowed, so the + Length field must have the smallest value required given the + rest of the message. + + + + + + + +Rekhter & Li [Page 7] + +RFC 1771 BGP-4 March 1995 + + + Type: + + This 1-octet unsigned integer indicates the type code of the + message. The following type codes are defined: + + 1 - OPEN + 2 - UPDATE + 3 - NOTIFICATION + 4 - KEEPALIVE + +4.2 OPEN Message Format + + After a transport protocol connection is established, the first + message sent by each side is an OPEN message. If the OPEN message is + acceptable, a KEEPALIVE message confirming the OPEN is sent back. + Once the OPEN is confirmed, UPDATE, KEEPALIVE, and NOTIFICATION + messages may be exchanged. + + In addition to the fixed-size BGP header, the OPEN message contains + the following fields: + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+ + | Version | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | My Autonomous System | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Hold Time | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | BGP Identifier | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Opt Parm Len | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | Optional Parameters | + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Version: + + This 1-octet unsigned integer indicates the protocol version + number of the message. The current BGP version number is 4. 
+ + My Autonomous System: + + This 2-octet unsigned integer indicates the Autonomous System + number of the sender. + + + +Rekhter & Li [Page 8] + +RFC 1771 BGP-4 March 1995 + + + Hold Time: + + This 2-octet unsigned integer indicates the number of seconds + that the sender proposes for the value of the Hold Timer. Upon + receipt of an OPEN message, a BGP speaker MUST calculate the + value of the Hold Timer by using the smaller of its configured + Hold Time and the Hold Time received in the OPEN message. The + Hold Time MUST be either zero or at least three seconds. An + implementation may reject connections on the basis of the Hold + Time. The calculated value indicates the maximum number of + seconds that may elapse between the receipt of successive + KEEPALIVE, and/or UPDATE messages by the sender. + + BGP Identifier: + + This 4-octet unsigned integer indicates the BGP Identifier of + the sender. A given BGP speaker sets the value of its BGP + Identifier to an IP address assigned to that BGP speaker. The + value of the BGP Identifier is determined on startup and is the + same for every local interface and every BGP peer. + + Optional Parameters Length: + + This 1-octet unsigned integer indicates the total length of the + Optional Parameters field in octets. If the value of this field + is zero, no Optional Parameters are present. + + Optional Parameters: + + This field may contain a list of optional parameters, where + each parameter is encoded as a <Parameter Type, Parameter + Length, Parameter Value> triplet. + + 0 1 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-... + | Parm. Type | Parm. Length | Parameter Value (variable) + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-... + + Parameter Type is a one octet field that unambiguously + identifies individual parameters. Parameter Length is a one + octet field that contains the length of the Parameter Value + field in octets. 
Parameter Value is a variable length field + that is interpreted according to the value of the Parameter + Type field. + + + + + + +Rekhter & Li [Page 9] + +RFC 1771 BGP-4 March 1995 + + + This document defines the following Optional Parameters: + + a) Authentication Information (Parameter Type 1): + + This optional parameter may be used to authenticate a BGP + peer. The Parameter Value field contains a 1-octet + Authentication Code followed by a variable length + Authentication Data. + + 0 1 2 3 4 5 6 7 8 + +-+-+-+-+-+-+-+-+ + | Auth. Code | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | | + | Authentication Data | + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Authentication Code: + + This 1-octet unsigned integer indicates the + authentication mechanism being used. Whenever an + authentication mechanism is specified for use within + BGP, three things must be included in the + specification: + + - the value of the Authentication Code which indicates + use of the mechanism, + - the form and meaning of the Authentication Data, and + - the algorithm for computing values of Marker fields. + + Note that a separate authentication mechanism may be + used in establishing the transport level connection. + + Authentication Data: + + The form and meaning of this field is a variable- + length field depend on the Authentication Code. + + The minimum length of the OPEN message is 29 octets (including + message header). + + + + + + + + + + +Rekhter & Li [Page 10] + +RFC 1771 BGP-4 March 1995 + + +4.3 UPDATE Message Format + + UPDATE messages are used to transfer routing information between BGP + peers. The information in the UPDATE packet can be used to construct + a graph describing the relationships of the various Autonomous + Systems. By applying rules to be discussed, routing information + loops and some other anomalies may be detected and removed from + inter-AS routing. 
+ + An UPDATE message is used to advertise a single feasible route to a + peer, or to withdraw multiple unfeasible routes from service (see + 3.1). An UPDATE message may simultaneously advertise a feasible route + and withdraw multiple unfeasible routes from service. The UPDATE + message always includes the fixed-size BGP header, and can optionally + include the other fields as shown below: + + +-----------------------------------------------------+ + | Unfeasible Routes Length (2 octets) | + +-----------------------------------------------------+ + | Withdrawn Routes (variable) | + +-----------------------------------------------------+ + | Total Path Attribute Length (2 octets) | + +-----------------------------------------------------+ + | Path Attributes (variable) | + +-----------------------------------------------------+ + | Network Layer Reachability Information (variable) | + +-----------------------------------------------------+ + + Unfeasible Routes Length: + + This 2-octets unsigned integer indicates the total length of + the Withdrawn Routes field in octets. Its value must allow the + length of the Network Layer Reachability Information field to + be determined as specified below. + + A value of 0 indicates that no routes are being withdrawn from + service, and that the WITHDRAWN ROUTES field is not present in + this UPDATE message. + + Withdrawn Routes: + + This is a variable length field that contains a list of IP + address prefixes for the routes that are being withdrawn from + service. 
Each IP address prefix is encoded as a 2-tuple of the + form <length, prefix>, whose fields are described below: + + + + + + +Rekhter & Li [Page 11] + +RFC 1771 BGP-4 March 1995 + + + +---------------------------+ + | Length (1 octet) | + +---------------------------+ + | Prefix (variable) | + +---------------------------+ + + The use and the meaning of these fields are as follows: + + a) Length: + + The Length field indicates the length in bits of the IP + address prefix. A length of zero indicates a prefix that + matches all IP addresses (with prefix, itself, of zero + octets). + + b) Prefix: + + The Prefix field contains IP address prefixes followed by + enough trailing bits to make the end of the field fall on an + octet boundary. Note that the value of trailing bits is + irrelevant. + + Total Path Attribute Length: + + This 2-octet unsigned integer indicates the total length of the + Path Attributes field in octets. Its value must allow the + length of the Network Layer Reachability field to be determined + as specified below. + + A value of 0 indicates that no Network Layer Reachability + Information field is present in this UPDATE message. + + Path Attributes: + + A variable length sequence of path attributes is present in + every UPDATE. Each path attribute is a triple <attribute type, + attribute length, attribute value> of variable length. + + Attribute Type is a two-octet field that consists of the + Attribute Flags octet followed by the Attribute Type Code + octet. + + 0 1 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Attr. Flags |Attr. Type Code| + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + + + +Rekhter & Li [Page 12] + +RFC 1771 BGP-4 March 1995 + + + The high-order bit (bit 0) of the Attribute Flags octet is the + Optional bit. It defines whether the attribute is optional (if + set to 1) or well-known (if set to 0). + + The second high-order bit (bit 1) of the Attribute Flags octet + is the Transitive bit. 
It defines whether an optional + attribute is transitive (if set to 1) or non-transitive (if set + to 0). For well-known attributes, the Transitive bit must be + set to 1. (See Section 5 for a discussion of transitive + attributes.) + + The third high-order bit (bit 2) of the Attribute Flags octet + is the Partial bit. It defines whether the information + contained in the optional transitive attribute is partial (if + set to 1) or complete (if set to 0). For well-known attributes + and for optional non-transitive attributes the Partial bit must + be set to 0. + + The fourth high-order bit (bit 3) of the Attribute Flags octet + is the Extended Length bit. It defines whether the Attribute + Length is one octet (if set to 0) or two octets (if set to 1). + Extended Length may be used only if the length of the attribute + value is greater than 255 octets. + + The lower-order four bits of the Attribute Flags octet are + unused. They must be zero (and must be ignored when received). + + The Attribute Type Code octet contains the Attribute Type Code. + Currently defined Attribute Type Codes are discussed in Section + 5. + + If the Extended Length bit of the Attribute Flags octet is set + to 0, the third octet of the Path Attribute contains the length + of the attribute data in octets. + + If the Extended Length bit of the Attribute Flags octet is set + to 1, then the third and the fourth octets of the path + attribute contain the length of the attribute data in octets. + + The remaining octets of the Path Attribute represent the + attribute value and are interpreted according to the Attribute + Flags and the Attribute Type Code. The supported Attribute Type + Codes, their attribute values and uses are the following: + + + + + + + + +Rekhter & Li [Page 13] + +RFC 1771 BGP-4 March 1995 + + + a) ORIGIN (Type Code 1): + + ORIGIN is a well-known mandatory attribute that defines the + origin of the path information.
The data octet can assume + the following values: + + Value Meaning + + 0 IGP - Network Layer Reachability Information + is interior to the originating AS + + 1 EGP - Network Layer Reachability Information + learned via EGP + + 2 INCOMPLETE - Network Layer Reachability + Information learned by some other means + + Its usage is defined in 5.1.1 + + b) AS_PATH (Type Code 2): + + AS_PATH is a well-known mandatory attribute that is composed + of a sequence of AS path segments. Each AS path segment is + represented by a triple <path segment type, path segment + length, path segment value>. + + + + + + + + + + + + + + + + + + + + + + + + + + +Rekhter & Li [Page 14] + +RFC 1771 BGP-4 March 1995 + + + The path segment type is a 1-octet long field with the + following values defined: + + Value Segment Type + + 1 AS_SET: unordered set of ASs a route in the + UPDATE message has traversed + + 2 AS_SEQUENCE: ordered set of ASs a route in + the UPDATE message has traversed + + The path segment length is a 1-octet long field containing + the number of ASs in the path segment value field. + + The path segment value field contains one or more AS + numbers, each encoded as a 2-octets long field. + + Usage of this attribute is defined in 5.1.2. + + c) NEXT_HOP (Type Code 3): + + This is a well-known mandatory attribute that defines the IP + address of the border router that should be used as the next + hop to the destinations listed in the Network Layer + Reachability field of the UPDATE message. + + Usage of this attribute is defined in 5.1.3. + + d) MULTI_EXIT_DISC (Type Code 4): + + This is an optional non-transitive attribute that is a four + octet non-negative integer. The value of this attribute may + be used by a BGP speaker's decision process to discriminate + among multiple exit points to a neighboring autonomous + system. + + Its usage is defined in 5.1.4. 
+ + e) LOCAL_PREF (Type Code 5): + + LOCAL_PREF is a well-known discretionary attribute that is a + four octet non-negative integer. It is used by a BGP speaker + to inform other BGP speakers in its own autonomous system of + the originating speaker's degree of preference for an + advertised route. Usage of this attribute is described in + 5.1.5. + + + + + +Rekhter & Li [Page 15] + +RFC 1771 BGP-4 March 1995 + + + f) ATOMIC_AGGREGATE (Type Code 6) + + ATOMIC_AGGREGATE is a well-known discretionary attribute of + length 0. It is used by a BGP speaker to inform other BGP + speakers that the local system selected a less specific + route without selecting a more specific route which is + included in it. Usage of this attribute is described in + 5.1.6. + + g) AGGREGATOR (Type Code 7) + + AGGREGATOR is an optional transitive attribute of length 6. + The attribute contains the last AS number that formed the + aggregate route (encoded as 2 octets), followed by the IP + address of the BGP speaker that formed the aggregate route + (encoded as 4 octets). Usage of this attribute is described + in 5.1.7 + + Network Layer Reachability Information: + + This variable length field contains a list of IP address + prefixes. The length in octets of the Network Layer + Reachability Information is not encoded explicitly, but can be + calculated as: + + UPDATE message Length - 23 - Total Path Attributes Length - + Unfeasible Routes Length + + where UPDATE message Length is the value encoded in the fixed- + size BGP header, Total Path Attribute Length and Unfeasible + Routes Length are the values encoded in the variable part of + the UPDATE message, and 23 is a combined length of the fixed- + size BGP header, the Total Path Attribute Length field and the + Unfeasible Routes Length field. 
+ + Reachability information is encoded as one or more 2-tuples of + the form <length, prefix>, whose fields are described below: + + +---------------------------+ + | Length (1 octet) | + +---------------------------+ + | Prefix (variable) | + +---------------------------+ + + + + + + + + +Rekhter & Li [Page 16] + +RFC 1771 BGP-4 March 1995 + + + The use and the meaning of these fields are as follows: + + a) Length: + + The Length field indicates the length in bits of the IP + address prefix. A length of zero indicates a prefix that + matches all IP addresses (with prefix, itself, of zero + octets). + + b) Prefix: + + The Prefix field contains IP address prefixes followed by + enough trailing bits to make the end of the field fall on an + octet boundary. Note that the value of the trailing bits is + irrelevant. + + The minimum length of the UPDATE message is 23 octets -- 19 octets + for the fixed header + 2 octets for the Unfeasible Routes Length + 2 + octets for the Total Path Attribute Length (the value of Unfeasible + Routes Length is 0 and the value of Total Path Attribute Length is + 0). + + An UPDATE message can advertise at most one route, which may be + described by several path attributes. All path attributes contained + in a given UPDATE message apply to the destinations carried in the + Network Layer Reachability Information field of the UPDATE message. + + An UPDATE message can list multiple routes to be withdrawn from + service. Each such route is identified by its destination (expressed + as an IP prefix), which unambiguously identifies the route in the + context of the BGP speaker - BGP speaker connection to which it has + previously been advertised. + + An UPDATE message may advertise only routes to be withdrawn from + service, in which case it will not include path attributes or Network + Layer Reachability Information. Conversely, it may advertise only a + feasible route, in which case the WITHDRAWN ROUTES field need not be + present.
+ +4.4 KEEPALIVE Message Format + + BGP does not use any transport protocol-based keep-alive mechanism to + determine if peers are reachable. Instead, KEEPALIVE messages are + exchanged between peers often enough as not to cause the Hold Timer + to expire. A reasonable maximum time between KEEPALIVE messages + would be one third of the Hold Time interval. KEEPALIVE messages + MUST NOT be sent more frequently than one per second. An + implementation MAY adjust the rate at which it sends KEEPALIVE + + + +Rekhter & Li [Page 17] + +RFC 1771 BGP-4 March 1995 + + + messages as a function of the Hold Time interval. + + If the negotiated Hold Time interval is zero, then periodic KEEPALIVE + messages MUST NOT be sent. + + KEEPALIVE message consists of only message header and has a length of + 19 octets. + +4.5 NOTIFICATION Message Format + + A NOTIFICATION message is sent when an error condition is detected. + The BGP connection is closed immediately after sending it. + + In addition to the fixed-size BGP header, the NOTIFICATION message + contains the following fields: + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Error code | Error subcode | Data | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + Error Code: + + This 1-octet unsigned integer indicates the type of + NOTIFICATION. The following Error Codes have been defined: + + Error Code Symbolic Name Reference + + 1 Message Header Error Section 6.1 + + 2 OPEN Message Error Section 6.2 + + 3 UPDATE Message Error Section 6.3 + + 4 Hold Timer Expired Section 6.5 + + 5 Finite State Machine Error Section 6.6 + + 6 Cease Section 6.7 + + Error subcode: + + This 1-octet unsigned integer provides more specific + information about the nature of the reported error. Each Error + Code may have one or more Error Subcodes associated with it. 
+ + + +Rekhter & Li [Page 18] + +RFC 1771 BGP-4 March 1995 + + + If no appropriate Error Subcode is defined, then a zero + (Unspecific) value is used for the Error Subcode field. + + Message Header Error subcodes: + + 1 - Connection Not Synchronized. + 2 - Bad Message Length. + 3 - Bad Message Type. + + OPEN Message Error subcodes: + + 1 - Unsupported Version Number. + 2 - Bad Peer AS. + 3 - Bad BGP Identifier. + 4 - Unsupported Optional Parameter. + 5 - Authentication Failure. + 6 - Unacceptable Hold Time. + + UPDATE Message Error subcodes: + + 1 - Malformed Attribute List. + 2 - Unrecognized Well-known Attribute. + 3 - Missing Well-known Attribute. + 4 - Attribute Flags Error. + 5 - Attribute Length Error. + 6 - Invalid ORIGIN Attribute. + 7 - AS Routing Loop. + 8 - Invalid NEXT_HOP Attribute. + 9 - Optional Attribute Error. + 10 - Invalid Network Field. + 11 - Malformed AS_PATH. + + Data: + + This variable-length field is used to diagnose the reason for + the NOTIFICATION. The contents of the Data field depend upon + the Error Code and Error Subcode. See Section 6 below for more + details. + + Note that the length of the Data field can be determined from + the message Length field by the formula: + + Message Length = 21 + Data Length + + The minimum length of the NOTIFICATION message is 21 octets + (including message header). + + + + + +Rekhter & Li [Page 19] + +RFC 1771 BGP-4 March 1995 + + +5. Path Attributes + + This section discusses the path attributes of the UPDATE message. + + Path attributes fall into four separate categories: + + 1. Well-known mandatory. + 2. Well-known discretionary. + 3. Optional transitive. + 4. Optional non-transitive. + + Well-known attributes must be recognized by all BGP implementations. + Some of these attributes are mandatory and must be included in every + UPDATE message. Others are discretionary and may or may not be sent + in a particular UPDATE message.
+ + All well-known attributes must be passed along (after proper + updating, if necessary) to other BGP peers. + + In addition to well-known attributes, each path may contain one or + more optional attributes. It is not required or expected that all + BGP implementations support all optional attributes. The handling of + an unrecognized optional attribute is determined by the setting of + the Transitive bit in the attribute flags octet. Paths with + unrecognized transitive optional attributes should be accepted. If a + path with unrecognized transitive optional attribute is accepted and + passed along to other BGP peers, then the unrecognized transitive + optional attribute of that path must be passed along with the path to + other BGP peers with the Partial bit in the Attribute Flags octet set + to 1. If a path with recognized transitive optional attribute is + accepted and passed along to other BGP peers and the Partial bit in + the Attribute Flags octet is set to 1 by some previous AS, it is not + set back to 0 by the current AS. Unrecognized non-transitive optional + attributes must be quietly ignored and not passed along to other BGP + peers. + + New transitive optional attributes may be attached to the path by the + originator or by any other AS in the path. If they are not attached + by the originator, the Partial bit in the Attribute Flags octet is + set to 1. The rules for attaching new non-transitive optional + attributes will depend on the nature of the specific attribute. The + documentation of each new non-transitive optional attribute will be + expected to include such rules. (The description of the + MULTI_EXIT_DISC attribute gives an example.) All optional attributes + (both transitive and non-transitive) may be updated (if appropriate) + by ASs in the path. 
+ + + + + +Rekhter & Li [Page 20] + +RFC 1771 BGP-4 March 1995 + + + The sender of an UPDATE message should order path attributes within + the UPDATE message in ascending order of attribute type. The + receiver of an UPDATE message must be prepared to handle path + attributes within the UPDATE message that are out of order. + + The same attribute cannot appear more than once within the Path + Attributes field of a particular UPDATE message. + +5.1 Path Attribute Usage + + The usage of each BGP path attributes is described in the following + clauses. + +5.1.1 ORIGIN + + ORIGIN is a well-known mandatory attribute. The ORIGIN attribute + shall be generated by the autonomous system that originates the + associated routing information. It shall be included in the UPDATE + messages of all BGP speakers that choose to propagate this + information to other BGP speakers. + +5.1.2 AS_PATH + + AS_PATH is a well-known mandatory attribute. This attribute + identifies the autonomous systems through which routing information + carried in this UPDATE message has passed. The components of this + list can be AS_SETs or AS_SEQUENCEs. + + When a BGP speaker propagates a route which it has learned from + another BGP speaker's UPDATE message, it shall modify the route's + AS_PATH attribute based on the location of the BGP speaker to which + the route will be sent: + + a) When a given BGP speaker advertises the route to another BGP + speaker located in its own autonomous system, the advertising + speaker shall not modify the AS_PATH attribute associated with the + route. + + b) When a given BGP speaker advertises the route to a BGP speaker + located in a neighboring autonomous system, then the advertising + speaker shall update the AS_PATH attribute as follows: + + 1) if the first path segment of the AS_PATH is of type + AS_SEQUENCE, the local system shall prepend its own AS number + as the last element of the sequence (put it in the leftmost + position). 
+ + + + + +Rekhter & Li [Page 21] + +RFC 1771 BGP-4 March 1995 + + + 2) if the first path segment of the AS_PATH is of type AS_SET, + the local system shall prepend a new path segment of type + AS_SEQUENCE to the AS_PATH, including its own AS number in that + segment. + + When a BGP speaker originates a route then: + + a) the originating speaker shall include its own AS number in + the AS_PATH attribute of all UPDATE messages sent to BGP + speakers located in neighboring autonomous systems. (In this + case, the AS number of the originating speaker's autonomous + system will be the only entry in the AS_PATH attribute). + + b) the originating speaker shall include an empty AS_PATH + attribute in all UPDATE messages sent to BGP speakers located + in its own autonomous system. (An empty AS_PATH attribute is + one whose length field contains the value zero). + +5.1.3 NEXT_HOP + + The NEXT_HOP path attribute defines the IP address of the border + router that should be used as the next hop to the destinations listed + in the UPDATE message. If a border router belongs to the same AS as + its peer, then the peer is an internal border router. Otherwise, it + is an external border router. A BGP speaker can advertise any + internal border router as the next hop provided that the interface + associated with the IP address of this border router (as specified in + the NEXT_HOP path attribute) shares a common subnet with both the + local and remote BGP speakers. A BGP speaker can advertise any + external border router as the next hop, provided that the IP address + of this border router was learned from one of the BGP speaker's + peers, and the interface associated with the IP address of this + border router (as specified in the NEXT_HOP path attribute) shares a + common subnet with the local and remote BGP speakers. A BGP speaker + needs to be able to support disabling advertisement of external + border routers. 
+ + A BGP speaker must never advertise an address of a peer to that peer + as a NEXT_HOP, for a route that the speaker is originating. A BGP + speaker must never install a route with itself as the next hop. + + When a BGP speaker advertises the route to a BGP speaker located in + its own autonomous system, the advertising speaker shall not modify + the NEXT_HOP attribute associated with the route. When a BGP speaker + receives the route via an internal link, it may forward packets to + the NEXT_HOP address if the address contained in the attribute is on + a common subnet with the local and remote BGP speakers. + + + + +Rekhter & Li [Page 22] + +RFC 1771 BGP-4 March 1995 + + +5.1.4 MULTI_EXIT_DISC + + The MULTI_EXIT_DISC attribute may be used on external (inter-AS) + links to discriminate among multiple exit or entry points to the same + neighboring AS. The value of the MULTI_EXIT_DISC attribute is a four + octet unsigned number which is called a metric. All other factors + being equal, the exit or entry point with lower metric should be + preferred. If received over external links, the MULTI_EXIT_DISC + attribute may be propagated over internal links to other BGP speakers + within the same AS. The MULTI_EXIT_DISC attribute is never + propagated to other BGP speakers in neighboring AS's. + +5.1.5 LOCAL_PREF + + LOCAL_PREF is a well-known discretionary attribute that shall be + included in all UPDATE messages that a given BGP speaker sends to the + other BGP speakers located in its own autonomous system. A BGP + speaker shall calculate the degree of preference for each external + route and include the degree of preference when advertising a route + to its internal peers. The higher degree of preference should be + preferred. A BGP speaker shall use the degree of preference learned + via LOCAL_PREF in its decision process (see section 9.1.1). 
+ + A BGP speaker shall not include this attribute in UPDATE messages + that it sends to BGP speakers located in a neighboring autonomous + system. If it is contained in an UPDATE message that is received from + a BGP speaker which is not located in the same autonomous system as + the receiving speaker, then this attribute shall be ignored by the + receiving speaker. + +5.1.6 ATOMIC_AGGREGATE + + ATOMIC_AGGREGATE is a well-known discretionary attribute. If a BGP + speaker, when presented with a set of overlapping routes from one of + its peers (see 9.1.4), selects the less specific route without + selecting the more specific one, then the local system shall attach + the ATOMIC_AGGREGATE attribute to the route when propagating it to + other BGP speakers (if that attribute is not already present in the + received less specific route). A BGP speaker that receives a route + with the ATOMIC_AGGREGATE attribute shall not remove the attribute + from the route when propagating it to other speakers. A BGP speaker + that receives a route with the ATOMIC_AGGREGATE attribute shall not + make any NLRI of that route more specific (as defined in 9.1.4) when + advertising this route to other BGP speakers. A BGP speaker that + receives a route with the ATOMIC_AGGREGATE attribute needs to be + cognizant of the fact that the actual path to destinations, as + specified in the NLRI of the route, while having the loop-free + property, may traverse ASs that are not listed in the AS_PATH + + + +Rekhter & Li [Page 23] + +RFC 1771 BGP-4 March 1995 + + + attribute. + +5.1.7 AGGREGATOR + + AGGREGATOR is an optional transitive attribute which may be included + in updates which are formed by aggregation (see Section 9.2.4.2). A + BGP speaker which performs route aggregation may add the AGGREGATOR + attribute which shall contain its own AS number and IP address. + +6. BGP Error Handling. + + This section describes actions to be taken when errors are detected + while processing BGP messages. 
+ + When any of the conditions described here are detected, a + NOTIFICATION message with the indicated Error Code, Error Subcode, + and Data fields is sent, and the BGP connection is closed. If no + Error Subcode is specified, then a zero must be used. + + The phrase "the BGP connection is closed" means that the transport + protocol connection has been closed and that all resources for that + BGP connection have been deallocated. Routing table entries + associated with the remote peer are marked as invalid. The fact that + the routes have become invalid is passed to other BGP peers before + the routes are deleted from the system. + + Unless specified explicitly, the Data field of the NOTIFICATION + message that is sent to indicate an error is empty. + +6.1 Message Header error handling. + + All errors detected while processing the Message Header are indicated + by sending the NOTIFICATION message with Error Code Message Header + Error. The Error Subcode elaborates on the specific nature of the + error. + + The expected value of the Marker field of the message header is all + ones if the message type is OPEN. The expected value of the Marker + field for all other types of BGP messages determined based on the + presence of the Authentication Information Optional Parameter in the + BGP OPEN message and the actual authentication mechanism (if the + Authentication Information in the BGP OPEN message is present). If + the Marker field of the message header is not the expected one, then + a synchronization error has occurred and the Error Subcode is set to + Connection Not Synchronized. 
+ + + + + + +Rekhter & Li [Page 24] + +RFC 1771 BGP-4 March 1995 + + + If the Length field of the message header is less than 19 or greater + than 4096, or if the Length field of an OPEN message is less than + the minimum length of the OPEN message, or if the Length field of an + UPDATE message is less than the minimum length of the UPDATE message, + or if the Length field of a KEEPALIVE message is not equal to 19, or + if the Length field of a NOTIFICATION message is less than the + minimum length of the NOTIFICATION message, then the Error Subcode is + set to Bad Message Length. The Data field contains the erroneous + Length field. + + If the Type field of the message header is not recognized, then the + Error Subcode is set to Bad Message Type. The Data field contains + the erroneous Type field. + +6.2 OPEN message error handling. + + All errors detected while processing the OPEN message are indicated + by sending the NOTIFICATION message with Error Code OPEN Message + Error. The Error Subcode elaborates on the specific nature of the + error. + + If the version number contained in the Version field of the received + OPEN message is not supported, then the Error Subcode is set to + Unsupported Version Number. The Data field is a 2-octet unsigned + integer, which indicates the largest locally supported version number + less than the version the remote BGP peer bid (as indicated in the + received OPEN message). + + If the Autonomous System field of the OPEN message is unacceptable, + then the Error Subcode is set to Bad Peer AS. The determination of + acceptable Autonomous System numbers is outside the scope of this + protocol. + + If the Hold Time field of the OPEN message is unacceptable, then the + Error Subcode MUST be set to Unacceptable Hold Time. An + implementation MUST reject Hold Time values of one or two seconds. + An implementation MAY reject any proposed Hold Time. 
An + implementation which accepts a Hold Time MUST use the negotiated + value for the Hold Time. + + If the BGP Identifier field of the OPEN message is syntactically + incorrect, then the Error Subcode is set to Bad BGP Identifier. + Syntactic correctness means that the BGP Identifier field represents + a valid IP host address. + + If one of the Optional Parameters in the OPEN message is not + recognized, then the Error Subcode is set to Unsupported Optional + Parameters. + + + +Rekhter & Li [Page 25] + +RFC 1771 BGP-4 March 1995 + + + If the OPEN message carries Authentication Information (as an + Optional Parameter), then the corresponding authentication procedure + is invoked. If the authentication procedure (based on Authentication + Code and Authentication Data) fails, then the Error Subcode is set to + Authentication Failure. + +6.3 UPDATE message error handling. + + All errors detected while processing the UPDATE message are indicated + by sending the NOTIFICATION message with Error Code UPDATE Message + Error. The error subcode elaborates on the specific nature of the + error. + + Error checking of an UPDATE message begins by examining the path + attributes. If the Unfeasible Routes Length or Total Attribute + Length is too large (i.e., if Unfeasible Routes Length + Total + Attribute Length + 23 exceeds the message Length), then the Error + Subcode is set to Malformed Attribute List. + + If any recognized attribute has Attribute Flags that conflict with + the Attribute Type Code, then the Error Subcode is set to Attribute + Flags Error. The Data field contains the erroneous attribute (type, + length and value). + + If any recognized attribute has Attribute Length that conflicts with + the expected length (based on the attribute type code), then the + Error Subcode is set to Attribute Length Error. The Data field + contains the erroneous attribute (type, length and value). 
+ + If any of the mandatory well-known attributes are not present, then + the Error Subcode is set to Missing Well-known Attribute. The Data + field contains the Attribute Type Code of the missing well-known + attribute. + + If any of the mandatory well-known attributes are not recognized, + then the Error Subcode is set to Unrecognized Well-known Attribute. + The Data field contains the unrecognized attribute (type, length and + value). + + If the ORIGIN attribute has an undefined value, then the Error + Subcode is set to Invalid ORIGIN Attribute. The Data field contains + the unrecognized attribute (type, length and value). + + If the NEXT_HOP attribute field is syntactically incorrect, then the + Error Subcode is set to Invalid NEXT_HOP Attribute. The Data field + contains the incorrect attribute (type, length and value). Syntactic + correctness means that the NEXT_HOP attribute represents a valid IP + host address. Semantic correctness applies only to the external BGP + + + +Rekhter & Li [Page 26] + +RFC 1771 BGP-4 March 1995 + + + links. It means that the interface associated with the IP address, as + specified in the NEXT_HOP attribute, shares a common subnet with the + receiving BGP speaker and is not the IP address of the receiving BGP + speaker. If the NEXT_HOP attribute is semantically incorrect, the + error should be logged, and the route should be ignored. In this + case, no NOTIFICATION message should be sent. + + The AS_PATH attribute is checked for syntactic correctness. If the + path is syntactically incorrect, then the Error Subcode is set to + Malformed AS_PATH. + + If an optional attribute is recognized, then the value of this + attribute is checked. If an error is detected, the attribute is + discarded, and the Error Subcode is set to Optional Attribute Error. + The Data field contains the attribute (type, length and value).
+ + If any attribute appears more than once in the UPDATE message, then + the Error Subcode is set to Malformed Attribute List. + + The NLRI field in the UPDATE message is checked for syntactic + validity. If the field is syntactically incorrect, then the Error + Subcode is set to Invalid Network Field. + +6.4 NOTIFICATION message error handling. + + If a peer sends a NOTIFICATION message, and there is an error in that + message, there is unfortunately no means of reporting this error via + a subsequent NOTIFICATION message. Any such error, such as an + unrecognized Error Code or Error Subcode, should be noticed, logged + locally, and brought to the attention of the administration of the + peer. The means to do this, however, lies outside the scope of this + document. + +6.5 Hold Timer Expired error handling. + + If a system does not receive successive KEEPALIVE and/or UPDATE + and/or NOTIFICATION messages within the period specified in the Hold + Time field of the OPEN message, then the NOTIFICATION message with + Hold Timer Expired Error Code must be sent and the BGP connection + closed. + +6.6 Finite State Machine error handling. + + Any error detected by the BGP Finite State Machine (e.g., receipt of + an unexpected event) is indicated by sending the NOTIFICATION message + with Error Code Finite State Machine Error. + + + + + +Rekhter & Li [Page 27] + +RFC 1771 BGP-4 March 1995 + + +6.7 Cease. + + In absence of any fatal errors (that are indicated in this section), + a BGP peer may choose at any given time to close its BGP connection + by sending the NOTIFICATION message with Error Code Cease. However, + the Cease NOTIFICATION message must not be used when a fatal error + indicated by this section does exist. + +6.8 Connection collision detection. + + If a pair of BGP speakers try simultaneously to establish a TCP + connection to each other, then two parallel connections between this + pair of speakers might well be formed. 
We refer to this situation as + connection collision. Clearly, one of these connections must be + closed. + + Based on the value of the BGP Identifier a convention is established + for detecting which BGP connection is to be preserved when a + collision does occur. The convention is to compare the BGP + Identifiers of the peers involved in the collision and to retain only + the connection initiated by the BGP speaker with the higher-valued + BGP Identifier. + + Upon receipt of an OPEN message, the local system must examine all of + its connections that are in the OpenConfirm state. A BGP speaker may + also examine connections in an OpenSent state if it knows the BGP + Identifier of the peer by means outside of the protocol. If among + these connections there is a connection to a remote BGP speaker whose + BGP Identifier equals the one in the OPEN message, then the local + system performs the following collision resolution procedure: + + 1. The BGP Identifier of the local system is compared to the BGP + Identifier of the remote system (as specified in the OPEN + message). + + 2. If the value of the local BGP Identifier is less than the + remote one, the local system closes BGP connection that already + exists (the one that is already in the OpenConfirm state), and + accepts BGP connection initiated by the remote system. + + 3. Otherwise, the local system closes newly created BGP connection + (the one associated with the newly received OPEN message), and + continues to use the existing one (the one that is already in the + OpenConfirm state). + + Comparing BGP Identifiers is done by treating them as (4-octet + long) unsigned integers. + + + + +Rekhter & Li [Page 28] + +RFC 1771 BGP-4 March 1995 + + + A connection collision with an existing BGP connection that is in + the Established state causes unconditional closing of the newly + created connection. Note that a connection collision cannot be + detected with connections that are in Idle, or Connect, or Active + states.
+ + Closing the BGP connection (that results from the collision + resolution procedure) is accomplished by sending the NOTIFICATION + message with the Error Code Cease. + +7. BGP Version Negotiation. + + BGP speakers may negotiate the version of the protocol by making + multiple attempts to open a BGP connection, starting with the highest + version number each supports. If an open attempt fails with an Error + Code OPEN Message Error, and an Error Subcode Unsupported Version + Number, then the BGP speaker has available the version number it + tried, the version number its peer tried, the version number passed + by its peer in the NOTIFICATION message, and the version numbers that + it supports. If the two peers do support one or more common + versions, then this will allow them to rapidly determine the highest + common version. In order to support BGP version negotiation, future + versions of BGP must retain the format of the OPEN and NOTIFICATION + messages. + +8. BGP Finite State machine. + + This section specifies BGP operation in terms of a Finite State + Machine (FSM). Following is a brief summary and overview of BGP + operations by state as determined by this FSM. A condensed version + of the BGP FSM is found in Appendix 1. + + Initially BGP is in the Idle state. + + Idle state: + + In this state BGP refuses all incoming BGP connections. No + resources are allocated to the peer. In response to the Start + event (initiated by either system or operator) the local system + initializes all BGP resources, starts the ConnectRetry timer, + initiates a transport connection to other BGP peer, while + listening for connection that may be initiated by the remote + BGP peer, and changes its state to Connect. The exact value of + the ConnectRetry timer is a local matter, but should be + sufficiently large to allow TCP initialization. + + If a BGP speaker detects an error, it shuts down the connection + and changes its state to Idle. 
Getting out of the Idle state + + + +Rekhter & Li [Page 29] + +RFC 1771 BGP-4 March 1995 + + + requires generation of the Start event. If such an event is + generated automatically, then persistent BGP errors may result + in persistent flapping of the speaker. To avoid such a + condition it is recommended that Start events should not be + generated immediately for a peer that was previously + transitioned to Idle due to an error. For a peer that was + previously transitioned to Idle due to an error, the time + between consecutive generation of Start events, if such events + are generated automatically, shall exponentially increase. The + value of the initial timer shall be 60 seconds. The time shall + be doubled for each consecutive retry. + + Any other event received in the Idle state is ignored. + + Connect state: + + In this state BGP is waiting for the transport protocol + connection to be completed. + + If the transport protocol connection succeeds, the local system + clears the ConnectRetry timer, completes initialization, sends + an OPEN message to its peer, and changes its state to OpenSent. + + If the transport protocol connect fails (e.g., retransmission + timeout), the local system restarts the ConnectRetry timer, + continues to listen for a connection that may be initiated by + the remote BGP peer, and changes its state to Active state. + + In response to the ConnectRetry timer expired event, the local + system restarts the ConnectRetry timer, initiates a transport + connection to other BGP peer, continues to listen for a + connection that may be initiated by the remote BGP peer, and + stays in the Connect state. + + Start event is ignored in the Active state. + + In response to any other event (initiated by either system or + operator), the local system releases all BGP resources + associated with this connection and changes its state to Idle. 
+ + Active state: + + In this state BGP is trying to acquire a peer by initiating a + transport protocol connection. + + If the transport protocol connection succeeds, the local system + clears the ConnectRetry timer, completes initialization, sends + an OPEN message to its peer, sets its Hold Timer to a large + + + +Rekhter & Li [Page 30] + +RFC 1771 BGP-4 March 1995 + + + value, and changes its state to OpenSent. A Hold Timer value + of 4 minutes is suggested. + + In response to the ConnectRetry timer expired event, the local + system restarts the ConnectRetry timer, initiates a transport + connection to other BGP peer, continues to listen for a + connection that may be initiated by the remote BGP peer, and + changes its state to Connect. + + If the local system detects that a remote peer is trying to + establish BGP connection to it, and the IP address of the + remote peer is not an expected one, the local system restarts + the ConnectRetry timer, rejects the attempted connection, + continues to listen for a connection that may be initiated by + the remote BGP peer, and stays in the Active state. + + Start event is ignored in the Active state. + + In response to any other event (initiated by either system or + operator), the local system releases all BGP resources + associated with this connection and changes its state to Idle. + + OpenSent state: + + In this state BGP waits for an OPEN message from its peer. + When an OPEN message is received, all fields are checked for + correctness. If the BGP message header checking or OPEN + message checking detects an error (see Section 6.2), or a + connection collision (see Section 6.8) the local system sends a + NOTIFICATION message and changes its state to Idle. + + If there are no errors in the OPEN message, BGP sends a + KEEPALIVE message and sets a KeepAlive timer. The Hold Timer, + which was originally set to a large value (see above), is + replaced with the negotiated Hold Time value (see section 4.2). 
+ If the negotiated Hold Time value is zero, then the Hold Time + timer and KeepAlive timers are not started. If the value of + the Autonomous System field is the same as the local Autonomous + System number, then the connection is an "internal" connection; + otherwise, it is "external". (This will affect UPDATE + processing as described below.) Finally, the state is changed + to OpenConfirm. + + If a disconnect notification is received from the underlying + transport protocol, the local system closes the BGP connection, + restarts the ConnectRetry timer, continues to listen for a + connection that may be initiated by the remote BGP peer, and + goes into the Active state. + + + +Rekhter & Li [Page 31] + +RFC 1771 BGP-4 March 1995 + + + If the Hold Timer expires, the local system sends NOTIFICATION + message with error code Hold Timer Expired and changes its + state to Idle. + + In response to the Stop event (initiated by either system or + operator) the local system sends NOTIFICATION message with + Error Code Cease and changes its state to Idle. + + Start event is ignored in the OpenSent state. + + In response to any other event the local system sends + NOTIFICATION message with Error Code Finite State Machine Error + and changes its state to Idle. + + Whenever BGP changes its state from OpenSent to Idle, it closes + the BGP (and transport-level) connection and releases all + resources associated with that connection. + + OpenConfirm state: + + In this state BGP waits for a KEEPALIVE or NOTIFICATION + message. + + If the local system receives a KEEPALIVE message, it changes + its state to Established. + + If the Hold Timer expires before a KEEPALIVE message is + received, the local system sends NOTIFICATION message with + error code Hold Timer Expired and changes its state to Idle. + + If the local system receives a NOTIFICATION message, it changes + its state to Idle. 
+ + If the KeepAlive timer expires, the local system sends a + KEEPALIVE message and restarts its KeepAlive timer. + + If a disconnect notification is received from the underlying + transport protocol, the local system changes its state to Idle. + + In response to the Stop event (initiated by either system or + operator) the local system sends NOTIFICATION message with + Error Code Cease and changes its state to Idle. + + Start event is ignored in the OpenConfirm state. + + In response to any other event the local system sends + NOTIFICATION message with Error Code Finite State Machine Error + and changes its state to Idle. + + + +Rekhter & Li [Page 32] + +RFC 1771 BGP-4 March 1995 + + + Whenever BGP changes its state from OpenConfirm to Idle, it + closes the BGP (and transport-level) connection and releases + all resources associated with that connection. + + Established state: + + In the Established state BGP can exchange UPDATE, NOTIFICATION, + and KEEPALIVE messages with its peer. + + If the local system receives an UPDATE or KEEPALIVE message, it + restarts its Hold Timer, if the negotiated Hold Time value is + non-zero. + + If the local system receives a NOTIFICATION message, it changes + its state to Idle. + + If the local system receives an UPDATE message and the UPDATE + message error handling procedure (see Section 6.3) detects an + error, the local system sends a NOTIFICATION message and + changes its state to Idle. + + If a disconnect notification is received from the underlying + transport protocol, the local system changes its state to Idle. + + If the Hold Timer expires, the local system sends a + NOTIFICATION message with Error Code Hold Timer Expired and + changes its state to Idle. + + If the KeepAlive timer expires, the local system sends a + KEEPALIVE message and restarts its KeepAlive timer. + + Each time the local system sends a KEEPALIVE or UPDATE message, + it restarts its KeepAlive timer, unless the negotiated Hold + Time value is zero. 
+ + In response to the Stop event (initiated by either system or + operator), the local system sends a NOTIFICATION message with + Error Code Cease and changes its state to Idle. + + Start event is ignored in the Established state. + + In response to any other event, the local system sends + NOTIFICATION message with Error Code Finite State Machine Error + and changes its state to Idle. + + Whenever BGP changes its state from Established to Idle, it + closes the BGP (and transport-level) connection, releases all + resources associated with that connection, and deletes all + + + +Rekhter & Li [Page 33] + +RFC 1771 BGP-4 March 1995 + + + routes derived from that connection. + +9. UPDATE Message Handling + + An UPDATE message may be received only in the Established state. + When an UPDATE message is received, each field is checked for + validity as specified in Section 6.3. + + If an optional non-transitive attribute is unrecognized, it is + quietly ignored. If an optional transitive attribute is + unrecognized, the Partial bit (the third high-order bit) in the + attribute flags octet is set to 1, and the attribute is retained for + propagation to other BGP speakers. + + If an optional attribute is recognized, and has a valid value, then, + depending on the type of the optional attribute, it is processed + locally, retained, and updated, if necessary, for possible + propagation to other BGP speakers. + + If the UPDATE message contains a non-empty WITHDRAWN ROUTES field, + the previously advertised routes whose destinations (expressed as IP + prefixes) are contained in this field shall be removed from the Adj-RIB- + In. This BGP speaker shall run its Decision Process since the + previously advertised route is no longer available for use. 
+ + If the UPDATE message contains a feasible route, it shall be placed + in the appropriate Adj-RIB-In, and the following additional actions + shall be taken: + + i) If its Network Layer Reachability Information (NLRI) is identical + to the one of a route currently stored in the Adj-RIB-In, then the + new route shall replace the older route in the Adj-RIB-In, thus + implicitly withdrawing the older route from service. The BGP speaker + shall run its Decision Process since the older route is no longer + available for use. + + ii) If the new route is an overlapping route that is included (see + 9.1.4) in an earlier route contained in the Adj-RIB-In, the BGP + speaker shall run its Decision Process since the more specific route + has implicitly made a portion of the less specific route unavailable + for use. + + iii) If the new route has identical path attributes to an earlier + route contained in the Adj-RIB-In, and is more specific (see 9.1.4) + than the earlier route, no further actions are necessary. + + iv) If the new route has NLRI that is not present in any of the + routes currently stored in the Adj-RIB-In, then the new route shall + + + +Rekhter & Li [Page 34] + +RFC 1771 BGP-4 March 1995 + + + be placed in the Adj-RIB-In. The BGP speaker shall run its Decision + Process. + + v) If the new route is an overlapping route that is less specific + (see 9.1.4) than an earlier route contained in the Adj-RIB-In, the + BGP speaker shall run its Decision Process on the set of destinations + described only by the less specific route. + +9.1 Decision Process + + The Decision Process selects routes for subsequent advertisement by + applying the policies in the local Policy Information Base (PIB) to + the routes stored in its Adj-RIB-In. The output of the Decision + Process is the set of routes that will be advertised to all peers; + the selected routes will be stored in the local speaker's Adj-RIB- + Out. 
+ + The selection process is formalized by defining a function that takes + the attribute of a given route as an argument and returns a non- + negative integer denoting the degree of preference for the route. + The function that calculates the degree of preference for a given + route shall not use as its inputs any of the following: the + existence of other routes, the non-existence of other routes, or the + path attributes of other routes. Route selection then consists of + individual application of the degree of preference function to each + feasible route, followed by the choice of the one with the highest + degree of preference. + + The Decision Process operates on routes contained in each Adj-RIB-In, + and is responsible for: + + - selection of routes to be advertised to BGP speakers located in + the local speaker's autonomous system + + - selection of routes to be advertised to BGP speakers located in + neighboring autonomous systems + + - route aggregation and route information reduction + + The Decision Process takes place in three distinct phases, each + triggered by a different event: + + a) Phase 1 is responsible for calculating the degree of preference + for each route received from a BGP speaker located in a + neighboring autonomous system, and for advertising to the other + BGP speakers in the local autonomous system the routes that have + the highest degree of preference for each distinct destination. + + + + +Rekhter & Li [Page 35] + +RFC 1771 BGP-4 March 1995 + + + b) Phase 2 is invoked on completion of phase 1. It is responsible + for choosing the best route out of all those available for each + distinct destination, and for installing each chosen route into + the appropriate Loc-RIB. + + c) Phase 3 is invoked after the Loc-RIB has been modified. It is + responsible for disseminating routes in the Loc-RIB to each peer + located in a neighboring autonomous system, according to the + policies contained in the PIB. 
Route aggregation and information + reduction can optionally be performed within this phase. + +9.1.1 Phase 1: Calculation of Degree of Preference + + The Phase 1 decision function shall be invoked whenever the local BGP + speaker receives an UPDATE message from a peer located in a + neighboring autonomous system that advertises a new route, a + replacement route, or a withdrawn route. + + The Phase 1 decision function is a separate process which completes + when it has no further work to do. + + The Phase 1 decision function shall lock an Adj-RIB-In prior to + operating on any route contained within it, and shall unlock it after + operating on all new or unfeasible routes contained within it. + + For each newly received or replacement feasible route, the local BGP + speaker shall determine a degree of preference. If the route is + learned from a BGP speaker in the local autonomous system, either the + value of the LOCAL_PREF attribute shall be taken as the degree of + preference, or the local system shall compute the degree of + preference of the route based on preconfigured policy information. If + the route is learned from a BGP speaker in a neighboring autonomous + system, then the degree of preference shall be computed based on + preconfigured policy information. The exact nature of this policy + information and the computation involved is a local matter. The + local speaker shall then run the internal update process of 9.2.1 to + select and advertise the most preferable route. + +9.1.2 Phase 2: Route Selection + + The Phase 2 decision function shall be invoked on completion of Phase + 1. The Phase 2 function is a separate process which completes when + it has no further work to do. The Phase 2 process shall consider all + routes that are present in the Adj-RIBs-In, including those received + from BGP speakers located in its own autonomous system and those + received from BGP speakers located in neighboring autonomous systems. 
+ + + + + +Rekhter & Li [Page 36] + +RFC 1771 BGP-4 March 1995 + + + The Phase 2 decision function shall be blocked from running while the + Phase 3 decision function is in process. The Phase 2 function shall + lock all Adj-RIBs-In prior to commencing its function, and shall + unlock them on completion. + + If the NEXT_HOP attribute of a BGP route depicts an address to which + the local BGP speaker doesn't have a route in its Loc-RIB, the BGP + route SHOULD be excluded from the Phase 2 decision function. + + For each set of destinations for which a feasible route exists in the + Adj-RIBs-In, the local BGP speaker shall identify the route that has: + + a) the highest degree of preference of any route to the same set + of destinations, or + + b) is the only route to that destination, or + + c) is selected as a result of the Phase 2 tie breaking rules + specified in 9.1.2.1. + + The local speaker SHALL then install that route in the Loc-RIB, + replacing any route to the same destination that is currently being + held in the Loc-RIB. The local speaker MUST determine the immediate + next hop to the address depicted by the NEXT_HOP attribute of the + selected route by performing a lookup in the IGP and selecting one of + the possible paths in the IGP. This immediate next hop MUST be used + when installing the selected route in the Loc-RIB. If the route to + the address depicted by the NEXT_HOP attribute changes such that the + immediate next hop changes, route selection should be recalculated as + specified above. + + Unfeasible routes shall be removed from the Loc-RIB, and + corresponding unfeasible routes shall then be removed from the Adj- + RIBs-In. + +9.1.2.1 Breaking Ties (Phase 2) + + In its Adj-RIBs-In a BGP speaker may have several routes to the same + destination that have the same degree of preference. The local + speaker can select only one of these routes for inclusion in the + associated Loc-RIB. 
The local speaker considers all equally + preferable routes, both those received from BGP speakers located in + neighboring autonomous systems, and those received from other BGP + speakers located in the local speaker's autonomous system. + + The following tie-breaking procedure assumes that for each candidate + route all the BGP speakers within an autonomous system can ascertain + the cost of a path (interior distance) to the address depicted by the + + + +Rekhter & Li [Page 37] + +RFC 1771 BGP-4 March 1995 + + + NEXT_HOP attribute of the route. Ties shall be broken according to + the following algorithm: + + a) If the local system is configured to take into account + MULTI_EXIT_DISC, and the candidate routes differ in their + MULTI_EXIT_DISC attribute, select the route that has the lowest + value of the MULTI_EXIT_DISC attribute. + + b) Otherwise, select the route that has the lowest cost (interior + distance) to the entity depicted by the NEXT_HOP attribute of the + route. If there are several routes with the same cost, then the + tie-breaking shall be broken as follows: + + - if at least one of the candidate routes was advertised by the + BGP speaker in a neighboring autonomous system, select the + route that was advertised by the BGP speaker in a neighboring + autonomous system whose BGP Identifier has the lowest value + among all other BGP speakers in neighboring autonomous systems; + + - otherwise, select the route that was advertised by the BGP + speaker whose BGP Identifier has the lowest value. 
+ +9.1.3 Phase 3: Route Dissemination + + The Phase 3 decision function shall be invoked on completion of Phase + 2, or when any of the following events occur: + + a) when routes in a Loc-RIB to local destinations have changed + + b) when locally generated routes learned by means outside of BGP + have changed + + c) when a new BGP speaker - BGP speaker connection has been + established + + The Phase 3 function is a separate process which completes when it + has no further work to do. The Phase 3 Routing Decision function + shall be blocked from running while the Phase 2 decision function is + in process. + + All routes in the Loc-RIB shall be processed into a corresponding + entry in the associated Adj-RIBs-Out. Route aggregation and + information reduction techniques (see 9.2.4.1) may optionally be + applied. + + For the benefit of future support of inter-AS multicast capabilities, + a BGP speaker that participates in inter-AS multicast routing shall + advertise a route it receives from one of its external peers and if + + + +Rekhter & Li [Page 38] + +RFC 1771 BGP-4 March 1995 + + + it installs it in its Loc-RIB, it shall advertise it back to the peer + from which the route was received. For a BGP speaker that does not + participate in inter-AS multicast routing such an advertisement is + optional. When doing such an advertisement, the NEXT_HOP attribute + should be set to the address of the peer. An implementation may also + optimize such an advertisement by truncating information in the + AS_PATH attribute to include only its own AS number and that of the + peer that advertised the route (such truncation requires the ORIGIN + attribute to be set to INCOMPLETE). In addition an implementation is + not required to pass optional or discretionary path attributes with + such an advertisement. + + When the updating of the Adj-RIBs-Out and the Forwarding Information + Base (FIB) is complete, the local BGP speaker shall run the external + update process of 9.2.2. 
+ +9.1.4 Overlapping Routes + + A BGP speaker may transmit routes with overlapping Network Layer + Reachability Information (NLRI) to another BGP speaker. NLRI overlap + occurs when a set of destinations are identified in non-matching + multiple routes. Since BGP encodes NLRI using IP prefixes, overlap + will always exhibit subset relationships. A route describing a + smaller set of destinations (a longer prefix) is said to be more + specific than a route describing a larger set of destinations (a + shorter prefix); similarly, a route describing a larger set of + destinations (a shorter prefix) is said to be less specific than a + route describing a smaller set of destinations (a longer prefix). + + The precedence relationship effectively decomposes less specific + routes into two parts: + + - a set of destinations described only by the less specific + route, and + + - a set of destinations described by the overlap of the less + specific and the more specific routes + + When overlapping routes are present in the same Adj-RIB-In, the more + specific route shall take precedence, in order from more specific to + least specific. + + The set of destinations described by the overlap represents a portion + of the less specific route that is feasible, but is not currently in + use. If a more specific route is later withdrawn, the set of + destinations described by the overlap will still be reachable using + the less specific route. + + + + +Rekhter & Li [Page 39] + +RFC 1771 BGP-4 March 1995 + + + If a BGP speaker receives overlapping routes, the Decision Process + shall take into account the semantics of the overlapping routes. In + particular, if a BGP speaker accepts the less specific route while + rejecting the more specific route from the same peer, then the + destinations represented by the overlap may not forward along the ASs + listed in the AS_PATH attribute of that route. 
Therefore, a BGP + speaker has the following choices: + + a) Install both the less and the more specific routes + + b) Install the more specific route only + + c) Install the non-overlapping part of the less specific + route only (that implies de-aggregation) + + d) Aggregate the two routes and install the aggregated route + + e) Install the less specific route only + + f) Install neither route + + If a BGP speaker chooses e), then it should add ATOMIC_AGGREGATE + attribute to the route. A route that carries ATOMIC_AGGREGATE + attribute can not be de-aggregated. That is, the NLRI of this route + can not be made more specific. Forwarding along such a route does + not guarantee that IP packets will actually traverse only ASs listed + in the AS_PATH attribute of the route. If a BGP speaker chooses a), + it must not advertise the more general route without the more + specific route. + +9.2 Update-Send Process + + The Update-Send process is responsible for advertising UPDATE + messages to all peers. For example, it distributes the routes chosen + by the Decision Process to other BGP speakers which may be located in + either the same autonomous system or a neighboring autonomous system. + Rules for information exchange between BGP speakers located in + different autonomous systems are given in 9.2.2; rules for + information exchange between BGP speakers located in the same + autonomous system are given in 9.2.1. + + Distribution of routing information between a set of BGP speakers, + all of which are located in the same autonomous system, is referred + to as internal distribution. + + + + + + + +Rekhter & Li [Page 40] + +RFC 1771 BGP-4 March 1995 + + +9.2.1 Internal Updates + + The Internal update process is concerned with the distribution of + routing information to BGP speakers located in the local speaker's + autonomous system. 
+ + When a BGP speaker receives an UPDATE message from another BGP + speaker located in its own autonomous system, the receiving BGP + speaker shall not re-distribute the routing information contained in + that UPDATE message to other BGP speakers located in its own + autonomous system. + + When a BGP speaker receives a new route from a BGP speaker in a + neighboring autonomous system, it shall advertise that route to all + other BGP speakers in its autonomous system by means of an UPDATE + message if any of the following conditions occur: + + 1) the degree of preference assigned to the newly received route + by the local BGP speaker is higher than the degree of preference + that the local speaker has assigned to other routes that have been + received from BGP speakers in neighboring autonomous systems, or + + 2) there are no other routes that have been received from BGP + speakers in neighboring autonomous systems, or + + 3) the newly received route is selected as a result of breaking a + tie between several routes which have the highest degree of + preference, and the same destination (the tie-breaking procedure + is specified in 9.2.1.1). + + When a BGP speaker receives an UPDATE message with a non-empty + WITHDRAWN ROUTES field, it shall remove from its Adj-RIB-In all + routes whose destinations were carried in this field (as IP prefixes). 
+ The speaker shall take the following additional steps: + + 1) if the corresponding feasible route had not been previously + advertised, then no further action is necessary + + 2) if the corresponding feasible route had been previously + advertised, then: + + i) if a new route is selected for advertisement that has the + same Network Layer Reachability Information as the unfeasible + routes, then the local BGP speaker shall advertise the + replacement route + + ii) if a replacement route is not available for advertisement, + then the BGP speaker shall include the destinations of the + + + +Rekhter & Li [Page 41] + +RFC 1771 BGP-4 March 1995 + + + unfeasible route (in form of IP prefixes) in the WITHDRAWN + ROUTES field of an UPDATE message, and shall send this message + to each peer to whom it had previously advertised the + corresponding feasible route. + + All feasible routes which are advertised shall be placed in the + appropriate Adj-RIBs-Out, and all unfeasible routes which are + advertised shall be removed from the Adj-RIBs-Out. + +9.2.1.1 Breaking Ties (Internal Updates) + + If a local BGP speaker has connections to several BGP speakers in + neighboring autonomous systems, there will be multiple Adj-RIBs-In + associated with these peers. These Adj-RIBs-In might contain several + equally preferable routes to the same destination, all of which were + advertised by BGP speakers located in neighboring autonomous systems. + The local BGP speaker shall select one of these routes according to + the following rules: + + a) If the candidate routes differ only in their NEXT_HOP and + MULTI_EXIT_DISC attributes, and the local system is configured to + take into account MULTI_EXIT_DISC attribute, select the route + that has the lowest value of the MULTI_EXIT_DISC attribute. + + b) If the local system can ascertain the cost of a path to the + entity depicted by the NEXT_HOP attribute of the candidate route, + select the route with the lowest cost. 
+ + c) In all other cases, select the route that was advertised by the + BGP speaker whose BGP Identifier has the lowest value. + +9.2.2 External Updates + + The external update process is concerned with the distribution of + routing information to BGP speakers located in neighboring autonomous + systems. As part of Phase 3 route selection process, the BGP speaker + has updated its Adj-RIBs-Out and its Forwarding Table. All newly + installed routes and all newly unfeasible routes for which there is + no replacement route shall be advertised to BGP speakers located in + neighboring autonomous systems by means of UPDATE message. + + Any routes in the Loc-RIB marked as unfeasible shall be removed. + Changes to the reachable destinations within its own autonomous + system shall also be advertised in an UPDATE message. + + + + + + + +Rekhter & Li [Page 42] + +RFC 1771 BGP-4 March 1995 + + +9.2.3 Controlling Routing Traffic Overhead + + The BGP protocol constrains the amount of routing traffic (that is, + UPDATE messages) in order to limit both the link bandwidth needed to + advertise UPDATE messages and the processing power needed by the + Decision Process to digest the information contained in the UPDATE + messages. + +9.2.3.1 Frequency of Route Advertisement + + The parameter MinRouteAdvertisementInterval determines the minimum + amount of time that must elapse between advertisement of routes to a + particular destination from a single BGP speaker. This rate limiting + procedure applies on a per-destination basis, although the value of + MinRouteAdvertisementInterval is set on a per BGP peer basis. + + Two UPDATE messages sent from a single BGP speaker that advertise + feasible routes to some common set of destinations received from BGP + speakers in neighboring autonomous systems must be separated by at + least MinRouteAdvertisementInterval. Clearly, this can only be + achieved precisely by keeping a separate timer for each common set of + destinations. 
This would be unwarranted overhead. Any technique which + ensures that the interval between two UPDATE messages sent from a + single BGP speaker that advertise feasible routes to some common set + of destinations received from BGP speakers in neighboring autonomous + systems will be at least MinRouteAdvertisementInterval, and will also + ensure a constant upper bound on the interval is acceptable. + + Since fast convergence is needed within an autonomous system, this + procedure does not apply for routes received from other BGP speakers + in the same autonomous system. To avoid long-lived black holes, the + procedure does not apply to the explicit withdrawal of unfeasible + routes (that is, routes whose destinations (expressed as IP prefixes) + are listed in the WITHDRAWN ROUTES field of an UPDATE message). + + This procedure does not limit the rate of route selection, but only + the rate of route advertisement. If new routes are selected multiple + times while awaiting the expiration of MinRouteAdvertisementInterval, + the last route selected shall be advertised at the end of + MinRouteAdvertisementInterval. + +9.2.3.2 Frequency of Route Origination + + The parameter MinASOriginationInterval determines the minimum amount + of time that must elapse between successive advertisements of UPDATE + messages that report changes within the advertising BGP speaker's own + autonomous systems. + + + + +Rekhter & Li [Page 43] + +RFC 1771 BGP-4 March 1995 + + +9.2.3.3 Jitter + + To minimize the likelihood that the distribution of BGP messages by a + given BGP speaker will contain peaks, jitter should be applied to the + timers associated with MinASOriginationInterval, Keepalive, and + MinRouteAdvertisementInterval. A given BGP speaker shall apply the + same jitter to each of these quantities regardless of the + destinations to which the updates are being sent; that is, jitter + will not be applied on a "per peer" basis. 
+ + The amount of jitter to be introduced shall be determined by + multiplying the base value of the appropriate timer by a random + factor which is uniformly distributed in the range from 0.75 to 1.0. + +9.2.4 Efficient Organization of Routing Information + + Having selected the routing information which it will advertise, a + BGP speaker may avail itself of several methods to organize this + information in an efficient manner. + +9.2.4.1 Information Reduction + + Information reduction may imply a reduction in granularity of policy + control - after information is collapsed, the same policies will + apply to all destinations and paths in the equivalence class. + + The Decision Process may optionally reduce the amount of information + that it will place in the Adj-RIBs-Out by any of the following + methods: + + a) Network Layer Reachability Information (NLRI): + + Destination IP addresses can be represented as IP address + prefixes. In cases where there is a correspondence between the + address structure and the systems under control of an autonomous + system administrator, it will be possible to reduce the size of + the NLRI carried in the UPDATE messages. + + b) AS_PATHs: + + AS path information can be represented as ordered AS_SEQUENCEs or + unordered AS_SETs. AS_SETs are used in the route aggregation + algorithm described in 9.2.4.2. They reduce the size of the + AS_PATH information by listing each AS number only once, + regardless of how many times it may have appeared in multiple + AS_PATHs that were aggregated. + + + + + +Rekhter & Li [Page 44] + +RFC 1771 BGP-4 March 1995 + + + An AS_SET implies that the destinations listed in the NLRI can be + reached through paths that traverse at least some of the + constituent autonomous systems. AS_SETs provide sufficient + information to avoid routing information looping; however their + use may prune potentially feasible paths, since such paths are no + longer listed individually as in the form of AS_SEQUENCEs. 
In + practice this is not likely to be a problem, since once an IP + packet arrives at the edge of a group of autonomous systems, the + BGP speaker at that point is likely to have more detailed path + information and can distinguish individual paths to destinations. + +9.2.4.2 Aggregating Routing Information + + Aggregation is the process of combining the characteristics of + several different routes in such a way that a single route can be + advertised. Aggregation can occur as part of the decision process + to reduce the amount of routing information that will be placed in + the Adj-RIBs-Out. + + Aggregation reduces the amount of information that a BGP speaker must + store and exchange with other BGP speakers. Routes can be aggregated + by applying the following procedure separately to path attributes of + like type and to the Network Layer Reachability Information. + + Routes that have the following attributes shall not be aggregated + unless the corresponding attributes of each route are identical: + MULTI_EXIT_DISC, NEXT_HOP. + + Path attributes that have different type codes can not be aggregated + together. Path attributes of the same type code may be aggregated, according to + the following rules: + + ORIGIN attribute: If at least one route among routes that are + aggregated has ORIGIN with the value INCOMPLETE, then the + aggregated route must have the ORIGIN attribute with the value + INCOMPLETE. Otherwise, if at least one route among routes that are + aggregated has ORIGIN with the value EGP, then the aggregated + route must have the origin attribute with the value EGP. In all + other cases the value of the ORIGIN attribute of the aggregated + route is INTERNAL. + + AS_PATH attribute: If routes to be aggregated have identical + AS_PATH attributes, then the aggregated route has the same AS_PATH + attribute as each individual route. 
+ + For the purpose of aggregating AS_PATH attributes we model each AS + within the AS_PATH attribute as a tuple <type, value>, where + "type" identifies a type of the path segment the AS belongs to + + + +Rekhter & Li [Page 45] + +RFC 1771 BGP-4 March 1995 + + + (e.g. AS_SEQUENCE, AS_SET), and "value" is the AS number. If the + routes to be aggregated have different AS_PATH attributes, then + the aggregated AS_PATH attribute shall satisfy all of the + following conditions: + + - all tuples of the type AS_SEQUENCE in the aggregated AS_PATH + shall appear in all of the AS_PATH in the initial set of routes + to be aggregated. + + - all tuples of the type AS_SET in the aggregated AS_PATH shall + appear in at least one of the AS_PATH in the initial set (they + may appear as either AS_SET or AS_SEQUENCE types). + + - for any tuple X of the type AS_SEQUENCE in the aggregated + AS_PATH which precedes tuple Y in the aggregated AS_PATH, X + precedes Y in each AS_PATH in the initial set which contains Y, + regardless of the type of Y. + + - No tuple with the same value shall appear more than once in + the aggregated AS_PATH, regardless of the tuple's type. + + An implementation may choose any algorithm which conforms to these + rules. At a minimum a conformant implementation shall be able to + perform the following algorithm that meets all of the above + conditions: + + - determine the longest leading sequence of tuples (as defined + above) common to all the AS_PATH attributes of the routes to be + aggregated. Make this sequence the leading sequence of the + aggregated AS_PATH attribute. + + - set the type of the rest of the tuples from the AS_PATH + attributes of the routes to be aggregated to AS_SET, and append + them to the aggregated AS_PATH attribute. + + - if the aggregated AS_PATH has more than one tuple with the + same value (regardless of tuple's type), eliminate all, but one + such tuple by deleting tuples of the type AS_SET from the + aggregated AS_PATH attribute. 
+ + Appendix 6, section 6.8 presents another algorithm that satisfies + the conditions and allows for more complex policy configurations. + + ATOMIC_AGGREGATE: If at least one of the routes to be aggregated + has ATOMIC_AGGREGATE path attribute, then the aggregated route + shall have this attribute as well. + + + + + +Rekhter & Li [Page 46] + +RFC 1771 BGP-4 March 1995 + + + AGGREGATOR: All AGGREGATOR attributes of all routes to be + aggregated should be ignored. + +9.3 Route Selection Criteria + + Generally speaking, additional rules for comparing routes among + several alternatives are outside the scope of this document. There + are two exceptions: + + - If the local AS appears in the AS path of the new route being + considered, then that new route cannot be viewed as better than + any other route. If such a route were ever used, a routing loop + would result. + + - In order to achieve successful distributed operation, only + routes with a likelihood of stability can be chosen. Thus, an AS + must avoid using unstable routes, and it must not make rapid + spontaneous changes to its choice of route. Quantifying the terms + "unstable" and "rapid" in the previous sentence will require + experience, but the principle is clear. + +9.4 Originating BGP routes + + A BGP speaker may originate BGP routes by injecting routing + information acquired by some other means (e.g. via an IGP) into BGP. + A BGP speaker that originates BGP routes shall assign the degree of + preference to these routes by passing them through the Decision + Process (see Section 9.1). These routes may also be distributed to + other BGP speakers within the local AS as part of the Internal update + process (see Section 9.2.1). The decision whether to distribute non- + BGP acquired routes within an AS via BGP or not depends on the + environment within the AS (e.g. type of IGP) and should be controlled + via configuration. 
+ + + + + + + + + + + + + + + + + + +Rekhter & Li [Page 47] + +RFC 1771 BGP-4 March 1995 + + +Appendix 1. BGP FSM State Transitions and Actions. + + This Appendix discusses the transitions between states in the BGP FSM + in response to BGP events. The following is the list of these states + and events when the negotiated Hold Time value is non-zero. + + BGP States: + + 1 - Idle + 2 - Connect + 3 - Active + 4 - OpenSent + 5 - OpenConfirm + 6 - Established + + BGP Events: + + 1 - BGP Start + 2 - BGP Stop + 3 - BGP Transport connection open + 4 - BGP Transport connection closed + 5 - BGP Transport connection open failed + 6 - BGP Transport fatal error + 7 - ConnectRetry timer expired + 8 - Hold Timer expired + 9 - KeepAlive timer expired + 10 - Receive OPEN message + 11 - Receive KEEPALIVE message + 12 - Receive UPDATE messages + 13 - Receive NOTIFICATION message + + + + + + + + + + + + + + + + + + + + + +Rekhter & Li [Page 48] + +RFC 1771 BGP-4 March 1995 + + + The following table describes the state transitions of the BGP FSM + and the actions triggered by these transitions. 
+ + + Event Actions Message Sent Next State + -------------------------------------------------------------------- + Idle (1) + 1 Initialize resources none 2 + Start ConnectRetry timer + Initiate a transport connection + others none none 1 + + Connect(2) + 1 none none 2 + 3 Complete initialization OPEN 4 + Clear ConnectRetry timer + 5 Restart ConnectRetry timer none 3 + 7 Restart ConnectRetry timer none 2 + Initiate a transport connection + others Release resources none 1 + + Active (3) + 1 none none 3 + 3 Complete initialization OPEN 4 + Clear ConnectRetry timer + 5 Close connection 3 + Restart ConnectRetry timer + 7 Restart ConnectRetry timer none 2 + Initiate a transport connection + others Release resources none 1 + + OpenSent(4) + 1 none none 4 + 4 Close transport connection none 3 + Restart ConnectRetry timer + 6 Release resources none 1 + 10 Process OPEN is OK KEEPALIVE 5 + Process OPEN failed NOTIFICATION 1 + others Close transport connection NOTIFICATION 1 + Release resources + + + + + + + + + + + +Rekhter & Li [Page 49] + +RFC 1771 BGP-4 March 1995 + + + OpenConfirm (5) + 1 none none 5 + 4 Release resources none 1 + 6 Release resources none 1 + 9 Restart KeepAlive timer KEEPALIVE 5 + 11 Complete initialization none 6 + Restart Hold Timer + 13 Close transport connection 1 + Release resources + others Close transport connection NOTIFICATION 1 + Release resources + + Established (6) + 1 none none 6 + 4 Release resources none 1 + 6 Release resources none 1 + 9 Restart KeepAlive timer KEEPALIVE 6 + 11 Restart Hold Timer KEEPALIVE 6 + 12 Process UPDATE is OK UPDATE 6 + Process UPDATE failed NOTIFICATION 1 + 13 Close transport connection 1 + Release resources + others Close transport connection NOTIFICATION 1 + Release resources + --------------------------------------------------------------------- + + + + + + + + + + + + + + + + + + + + + + + + + + +Rekhter & Li [Page 50] + +RFC 1771 BGP-4 March 1995 + + + The following is a condensed version of the above 
state transition + table. + + + Events| Idle | Connect | Active | OpenSent | OpenConfirm | Estab + | (1) | (2) | (3) | (4) | (5) | (6) + |-------------------------------------------------------------- + 1 | 2 | 2 | 3 | 4 | 5 | 6 + | | | | | | + 2 | 1 | 1 | 1 | 1 | 1 | 1 + | | | | | | + 3 | 1 | 4 | 4 | 1 | 1 | 1 + | | | | | | + 4 | 1 | 1 | 1 | 3 | 1 | 1 + | | | | | | + 5 | 1 | 3 | 3 | 1 | 1 | 1 + | | | | | | + 6 | 1 | 1 | 1 | 1 | 1 | 1 + | | | | | | + 7 | 1 | 2 | 2 | 1 | 1 | 1 + | | | | | | + 8 | 1 | 1 | 1 | 1 | 1 | 1 + | | | | | | + 9 | 1 | 1 | 1 | 1 | 5 | 6 + | | | | | | + 10 | 1 | 1 | 1 | 1 or 5 | 1 | 1 + | | | | | | + 11 | 1 | 1 | 1 | 1 | 6 | 6 + | | | | | | + 12 | 1 | 1 | 1 | 1 | 1 | 1 or 6 + | | | | | | + 13 | 1 | 1 | 1 | 1 | 1 | 1 + | | | | | | + --------------------------------------------------------------- + + +Appendix 2. Comparison with RFC1267 + + BGP-4 is capable of operating in an environment where a set of + reachable destinations may be expressed via a single IP prefix. The + concept of network classes, or subnetting is foreign to BGP-4. To + accommodate these capabilities BGP-4 changes semantics and encoding + associated with the AS_PATH attribute. New text has been added to + define semantics associated with IP prefixes. These abilities allow + BGP-4 to support the proposed supernetting scheme [9]. + + To simplify configuration this version introduces a new attribute, + LOCAL_PREF, that facilitates route selection procedures. + + + +Rekhter & Li [Page 51] + +RFC 1771 BGP-4 March 1995 + + + The INTER_AS_METRIC attribute has been renamed to be MULTI_EXIT_DISC. + A new attribute, ATOMIC_AGGREGATE, has been introduced to insure that + certain aggregates are not de-aggregated. Another new attribute, + AGGREGATOR, can be added to aggregate routes in order to advertise + which AS and which BGP speaker within that AS caused the aggregation. + + To insure that Hold Timers are symmetric, the Hold Time is now + negotiated on a per-connection basis. 
Hold Times of zero are now + supported. + +Appendix 3. Comparison with RFC 1163 + + All of the changes listed in Appendix 2, plus the following. + + To detect and recover from BGP connection collision, a new field (BGP + Identifier) has been added to the OPEN message. New text (Section + 6.8) has been added to specify the procedure for detecting and + recovering from collision. + + The new document no longer restricts the border router that is passed + in the NEXT_HOP path attribute to be part of the same Autonomous + System as the BGP Speaker. + + New document optimizes and simplifies the exchange of the information + about previously reachable routes. + +Appendix 4. Comparison with RFC 1105 + + All of the changes listed in Appendices 2 and 3, plus the following. + + Minor changes to the RFC1105 Finite State Machine were necessary to + accommodate the TCP user interface provided by 4.3 BSD. + + The notion of Up/Down/Horizontal relations present in RFC1105 has + been removed from the protocol. + + The changes in the message format from RFC1105 are as follows: + + 1. The Hold Time field has been removed from the BGP header and + added to the OPEN message. + + 2. The version field has been removed from the BGP header and + added to the OPEN message. + + 3. The Link Type field has been removed from the OPEN message. + + 4. The OPEN CONFIRM message has been eliminated and replaced with + implicit confirmation provided by the KEEPALIVE message. + + + +Rekhter & Li [Page 52] + +RFC 1771 BGP-4 March 1995 + + + 5. The format of the UPDATE message has been changed + significantly. New fields were added to the UPDATE message to + support multiple path attributes. + + 6. The Marker field has been expanded and its role broadened to + support authentication. 
+ + Note that quite often BGP, as specified in RFC 1105, is referred + to as BGP-1, BGP, as specified in RFC 1163, is referred to as + BGP-2, BGP, as specified in RFC1267 is referred to as BGP-3, and + BGP, as specified in this document is referred to as BGP-4. + +Appendix 5. TCP options that may be used with BGP + + If a local system TCP user interface supports TCP PUSH function, then + each BGP message should be transmitted with PUSH flag set. Setting + PUSH flag forces BGP messages to be transmitted promptly to the + receiver. + + If a local system TCP user interface supports setting precedence for + TCP connection, then the BGP transport connection should be opened + with precedence set to Internetwork Control (110) value (see also + [6]). + +Appendix 6. Implementation Recommendations + + This section presents some implementation recommendations. + +6.1 Multiple Networks Per Message + + The BGP protocol allows for multiple address prefixes with the same + AS path and next-hop gateway to be specified in one message. Making + use of this capability is highly recommended. With one address prefix + per message there is a substantial increase in overhead in the + receiver. Not only does the system overhead increase due to the + reception of multiple messages, but the overhead of scanning the + routing table for updates to BGP peers and other routing protocols + (and sending the associated messages) is incurred multiple times as + well. One method of building messages containing many address + prefixes per AS path and gateway from a routing table that is not + organized per AS path is to build many messages as the routing table + is scanned. As each address prefix is processed, a message for the + associated AS path and gateway is allocated, if it does not exist, + and the new address prefix is added to it. If such a message exists, + the new address prefix is just appended to it. 
If the message lacks + the space to hold the new address prefix, it is transmitted, a new + message is allocated, and the new address prefix is inserted into the + new message. When the entire routing table has been scanned, all + + + +Rekhter & Li [Page 53] + +RFC 1771 BGP-4 March 1995 + + + allocated messages are sent and their resources released. Maximum + compression is achieved when all the destinations covered by the + address prefixes share a gateway and common path attributes, making + it possible to send many address prefixes in one 4096-byte message. + + When peering with a BGP implementation that does not compress + multiple address prefixes into one message, it may be necessary to + take steps to reduce the overhead from the flood of data received + when a peer is acquired or a significant network topology change + occurs. One method of doing this is to limit the rate of updates. + This will eliminate the redundant scanning of the routing table to + provide flash updates for BGP peers and other routing protocols. A + disadvantage of this approach is that it increases the propagation + latency of routing information. By choosing a minimum flash update + interval that is not much greater than the time it takes to process + the multiple messages this latency should be minimized. A better + method would be to read all received messages before sending updates. + +6.2 Processing Messages on a Stream Protocol + + BGP uses TCP as a transport mechanism. Due to the stream nature of + TCP, all the data for received messages does not necessarily arrive + at the same time. This can make it difficult to process the data as + messages, especially on systems such as BSD Unix where it is not + possible to determine how much data has been received but not yet + processed. + + One method that can be used in this situation is to first try to read + just the message header. 
For the KEEPALIVE message type, this is a + complete message; for other message types, the header should first be + verified, in particular the total length. If all checks are + successful, the specified length, minus the size of the message + header is the amount of data left to read. An implementation that + would "hang" the routing information process while trying to read + from a peer could set up a message buffer (4096 bytes) per peer and + fill it with data as available until a complete message has been + received. + +6.3 Reducing route flapping + + To avoid excessive route flapping a BGP speaker which needs to + withdraw a destination and send an update about a more specific or + less specific route shall combine them into the same UPDATE message. + + + + + + + + +Rekhter & Li [Page 54] + +RFC 1771 BGP-4 March 1995 + + +6.4 BGP Timers + + BGP employs five timers: ConnectRetry, Hold Time, KeepAlive, + MinASOriginationInterval, and MinRouteAdvertisementInterval. The + suggested value for the ConnectRetry timer is 120 seconds. The + suggested value for the Hold Time is 90 seconds. The suggested value + for the KeepAlive timer is 30 seconds. The suggested value for the + MinASOriginationInterval is 15 seconds. The suggested value for the + MinRouteAdvertisementInterval is 30 seconds. + + An implementation of BGP MUST allow these timers to be configurable. + +6.5 Path attribute ordering + + Implementations which combine update messages as described above in + 6.1 may prefer to see all path attributes presented in a known order. + This permits them to quickly identify sets of attributes from + different update messages which are semantically identical. To + facilitate this, it is a useful optimization to order the path + attributes according to type code. This optimization is entirely + optional. + +6.6 AS_SET sorting + + Another useful optimization that can be done to simplify this + situation is to sort the AS numbers found in an AS_SET. 
This + optimization is entirely optional. + +6.7 Control over version negotiation + + Since BGP-4 is capable of carrying aggregated routes which cannot be + properly represented in BGP-3, an implementation which supports BGP-4 + and another BGP version should provide the capability to only speak + BGP-4 on a per-peer basis. + +6.8 Complex AS_PATH aggregation + + An implementation which chooses to provide a path aggregation + algorithm which retains significant amounts of path information may + wish to use the following procedure: + + For the purpose of aggregating AS_PATH attributes of two routes, + we model each AS as a tuple <type, value>, where "type" identifies + a type of the path segment the AS belongs to (e.g. AS_SEQUENCE, + AS_SET), and "value" is the AS number. Two ASs are said to be the + same if their corresponding <type, value> tuples are the same. + + + + + +Rekhter & Li [Page 55] + +RFC 1771 BGP-4 March 1995 + + + The algorithm to aggregate two AS_PATH attributes works as + follows: + + a) Identify the same ASs (as defined above) within each AS_PATH + attribute that are in the same relative order within both + AS_PATH attributes. Two ASs, X and Y, are said to be in the + same order if either: + + - X precedes Y in both AS_PATH attributes, or - Y precedes X + in both AS_PATH attributes. + + b) The aggregated AS_PATH attribute consists of ASs identified + in (a) in exactly the same order as they appear in the AS_PATH + attributes to be aggregated. If two consecutive ASs identified + in (a) do not immediately follow each other in both of the + AS_PATH attributes to be aggregated, then the intervening ASs + (ASs that are between the two consecutive ASs that are the + same) in both attributes are combined into an AS_SET path + segment that consists of the intervening ASs from both AS_PATH + attributes; this segment is then placed in between the two + consecutive ASs identified in (a) of the aggregated attribute. 
+ If two consecutive ASs identified in (a) immediately follow + each other in one attribute, but do not follow in another, then + the intervening ASs of the latter are combined into an AS_SET + path segment; this segment is then placed in between the two + consecutive ASs identified in (a) of the aggregated attribute. + + If as a result of the above procedure a given AS number appears + more than once within the aggregated AS_PATH attribute, all, but + the last instance (rightmost occurrence) of that AS number should + be removed from the aggregated AS_PATH attribute. + +References + + [1] Mills, D., "Exterior Gateway Protocol Formal Specification", RFC + 904, BBN, April 1984. + + [2] Rekhter, Y., "EGP and Policy Based Routing in the New NSFNET + Backbone", RFC 1092, T.J. Watson Research Center, February 1989. + + [3] Braun, H-W., "The NSFNET Routing Architecture", RFC 1093, + MERIT/NSFNET Project, February 1989. + + [4] Postel, J., "Transmission Control Protocol - DARPA Internet + Program Protocol Specification", STD 7, RFC 793, DARPA, September + 1981. + + + + + +Rekhter & Li [Page 56] + +RFC 1771 BGP-4 March 1995 + + + [5] Rekhter, Y., and P. Gross, "Application of the Border Gateway + Protocol in the Internet", RFC 1772, T.J. Watson Research Center, + IBM Corp., MCI, March 1995. + + [6] Postel, J., "Internet Protocol - DARPA Internet Program Protocol + Specification", STD 5, RFC 791, DARPA, September 1981. + + [7] "Information Processing Systems - Telecommunications and + Information Exchange between Systems - Protocol for Exchange of + Inter-domain Routeing Information among Intermediate Systems to + Support Forwarding of ISO 8473 PDUs", ISO/IEC IS10747, 1993 + + [8] Fuller, V., Li, T., Yu, J., and K. Varadhan, "Classless Inter- + Domain Routing (CIDR): an Address Assignment and Aggregation + Strategy", RFC 1519, BARRNet, cisco, MERIT, OARnet, September + 1993 + + [9] Rekhter, Y., Li, T., "An Architecture for IP Address Allocation + with CIDR", RFC 1518, T.J. 
Watson Research Center, cisco, + September 1993 + +Security Considerations + + Security issues are not discussed in this document. + +Editors' Addresses + + Yakov Rekhter + T.J. Watson Research Center IBM Corporation + P.O. Box 704, Office H3-D40 + Yorktown Heights, NY 10598 + + Phone: +1 914 784 7361 + EMail: yakov@watson.ibm.com + + + Tony Li + cisco Systems, Inc. + 170 W. Tasman Dr. + San Jose, CA 95134 + + EMail: tli@cisco.com + + + + + + + + + +Rekhter & Li [Page 57] + diff --git a/doc/rfc4360.txt b/doc/rfc4360.txt @@ -0,0 +1,675 @@ + + + + + + +Network Working Group S. Sangli +Request for Comments: 4360 D. Tappan +Category: Standards Track Cisco Systems + Y. Rekhter + Juniper Networks + February 2006 + + + BGP Extended Communities Attribute + +Status of This Memo + + This document specifies an Internet standards track protocol for the + Internet community, and requests discussion and suggestions for + improvements. Please refer to the current edition of the "Internet + Official Protocol Standards" (STD 1) for the standardization state + and status of this protocol. Distribution of this memo is unlimited. + +Copyright Notice + + Copyright (C) The Internet Society (2006). + +Abstract + + This document describes the "extended community" BGP-4 attribute. + This attribute provides a mechanism for labeling information carried + in BGP-4. These labels can be used to control the distribution of + this information, or for other applications. + +1. Introduction + + The Extended Community Attribute provides a mechanism for labeling + information carried in BGP-4 [BGP-4]. It provides two important + enhancements over the existing BGP Community Attribute [RFC1997]: + + - An extended range, ensuring that communities can be assigned for + a plethora of uses, without fear of overlap. + + - The addition of a Type field provides structure for the + community space. 
+ + The addition of structure allows the usage of policy based on the + application for which the community value will be used. For example, + one can filter out all communities of a particular type, or allow + only certain values for a particular type of community. It also + allows one to specify whether a particular community is transitive or + non-transitive across an Autonomous System (AS) boundary. Without + structure, this can only be accomplished by explicitly enumerating + + + +Sangli, et al. Standards Track [Page 1] + +RFC 4360 BGP Extended Communities Attribute February 2006 + + + all community values that will be denied or allowed and passed to BGP + speakers in neighboring ASes based on the transitive property. + +1.1. Specification of Requirements + + The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", + "SHOULD", "SHOULD NOT", "RECOMMENDED", "MAY", and "OPTIONAL" in this + document are to be interpreted as described in RFC 2119 [RFC2119]. + +2. BGP Extended Communities Attribute + + The Extended Communities Attribute is a transitive optional BGP + attribute, with the Type Code 16. The attribute consists of a set of + "extended communities". All routes with the Extended Communities + attribute belong to the communities listed in the attribute. + + Each Extended Community is encoded as an 8-octet quantity, as + follows: + + - Type Field : 1 or 2 octets + - Value Field : Remaining octets + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Type high | Type low(*) | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ Value | + | | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + (*) Present for Extended types only, used for the Value field + otherwise. + + Type Field: + + Two classes of Type Field are introduced: Regular type and + Extended type. 
+ + The size of Type Field for Regular types is 1 octet, and the + size of the Type Field for Extended types is 2 octets. + + The value of the high-order octet of the Type Field determines + if an extended community is a Regular type or an Extended type. + The class of a type (Regular or Extended) is not encoded in the + structure of the type itself. The class of a type is specified + in the document that defines the type and the IANA registry. + + + + + +Sangli, et al. Standards Track [Page 2] + +RFC 4360 BGP Extended Communities Attribute February 2006 + + + The high-order octet of the Type Field is as shown below: + + 0 1 2 3 4 5 6 7 + +-+-+-+-+-+-+-+-+ + |I|T| | + +-+-+-+-+-+-+-+-+ + + I - IANA authority bit + + Value 0: IANA-assignable type using the "First Come First + Served" policy + + Value 1: Part of this Type Field space is for IANA + assignable types using either the Standard Action or the + Early IANA Allocation policy. The rest of this Type + Field space is for Experimental use. + + T - Transitive bit + + Value 0: The community is transitive across ASes + + Value 1: The community is non-transitive across ASes + + Remaining 6 bits: Indicates the structure of the community + + Value Field: + + The encoding of the Value Field is dependent on the "type" of + the community as specified by the Type Field. + + Two extended communities are declared equal only when all 8 octets of + the community are equal. + + The two members in the tuple <Type, Value> should be enumerated to + specify any community value. The remaining octets of the community + are interpreted based on the value of the Type field. + +3. Defined BGP Extended Community Types + + This section introduces a few extended types and defines the format + of the Value Field for those types. 
The types introduced here + provide "templates", where each template is identified by the high- + order octet of the extended community Type field, and the lower-order + octet (sub-type) is used to indicate a particular type of extended + community. + + + + + + +Sangli, et al. Standards Track [Page 3] + +RFC 4360 BGP Extended Communities Attribute February 2006 + + +3.1. Two-Octet AS Specific Extended Community + + This is an extended type with Type Field composed of 2 octets and + Value Field composed of 6 octets. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | 0x00 or 0x40 | Sub-Type | Global Administrator | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Local Administrator | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + The value of the high-order octet of this extended type is either + 0x00 or 0x40. The low-order octet of this extended type is used to + indicate sub-types. + + The Value Field consists of two sub-fields: + + Global Administrator sub-field: 2 octets + + This sub-field contains an Autonomous System number assigned by + IANA. + + Local Administrator sub-field: 4 octets + + The organization identified by Autonomous System number in the + Global Administrator sub-field can encode any information in + this sub-field. The format and meaning of the value encoded in + this sub-field should be defined by the sub-type of the + community. + +3.2. IPv4 Address Specific Extended Community + + This is an extended type with Type Field composed of 2 octets and + Value Field composed of 6 octets. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | 0x01 or 0x41 | Sub-Type | Global Administrator | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Global Administrator (cont.) 
| Local Administrator | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + The value of the high-order octet of this extended type is either + 0x01 or 0x41. The low-order octet of this extended type is used to + indicate sub-types. + + + +Sangli, et al. Standards Track [Page 4] + +RFC 4360 BGP Extended Communities Attribute February 2006 + + + The Value field consists of two sub-fields: + + Global Administrator sub-field: 4 octets + + This sub-field contains an IPv4 unicast address assigned by one + of the Internet registries. + + Local Administrator sub-field: 2 octets + + The organization that has been assigned the IPv4 address in the + Global Administrator sub-field can encode any information in + this sub-field. The format and meaning of this value encoded + in this sub-field should be defined by the sub-type of the + community. + +3.3. Opaque Extended Community + + This is an extended type with Type Field composed of 2 octets and + Value Field composed of 6 octets. + + 0 1 2 3 + 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | 0x03 or 0x43 | Sub-Type | Value | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + | Value (cont.) | + +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ + + The value of the high-order octet of this extended type is either + 0x03 or 0x43. The low-order octet of this extended type is used to + indicate sub-types. + + This is a generic community of extended type. The value of the sub- + type that should define the Value Field is to be assigned by IANA. + +4. Route Target Community + + The Route Target Community identifies one or more routers that may + receive a set of routes (that carry this Community) carried by BGP. + This is transitive across the Autonomous System boundary. + + The Route Target Community is of an extended type. 
+ + The value of the high-order octet of the Type field for the Route + Target Community can be 0x00, 0x01, or 0x02. The value of the low- + order octet of the Type field for this community is 0x02. + + + + + +Sangli, et al. Standards Track [Page 5] + +RFC 4360 BGP Extended Communities Attribute February 2006 + + + When the value of the high-order octet of the Type field is 0x00 or + 0x02, the Local Administrator sub-field contains a number from a + numbering space that is administered by the organization to which the + Autonomous System number carried in the Global Administrator sub- + field has been assigned by an appropriate authority. + + When the value of the high-order octet of the Type field is 0x01, the + Local Administrator sub-field contains a number from a numbering + space that is administered by the organization to which the IP + address carried in the Global Administrator sub-field has been + assigned by an appropriate authority. + + One possible use of the Route Target Community is specified in + [RFC4364]. + +5. Route Origin Community + + The Route Origin Community identifies one or more routers that inject + a set of routes (that carry this Community) into BGP. This is + transitive across the Autonomous System boundary. + + The Route Origin Community is of an extended type. + + The value of the high-order octet of the Type field for the Route + Origin Community can be 0x00, 0x01, or 0x02. The value of the low- + order octet of the Type field for this community is 0x03. + + When the value of the high-order octet of the Type field is 0x00 or + 0x02, the Local Administrator sub-field contains a number from a + numbering space that is administered by the organization to which the + Autonomous System number carried in the Global Administrator sub- + field has been assigned by an appropriate authority. 
+ + When the value of the high-order octet of the Type field is 0x01, the + Local Administrator sub-field contains a number from a numbering + space that is administered by the organization to which the IP + address carried in the Global Administrator sub-field has been + assigned by an appropriate authority. + + One possible use of the Route Origin Community is specified in + [RFC4364]. + + + + + + + + + + +Sangli, et al. Standards Track [Page 6] + +RFC 4360 BGP Extended Communities Attribute February 2006 + + +6. Operations + + A BGP speaker may use the Extended Communities attribute to control + which routing information it accepts or distributes to its peers. + + The Extended Community attribute MUST NOT be used to modify the BGP + best path selection algorithm in a way that leads to forwarding + loops. + + A BGP speaker receiving a route that doesn't have the Extended + Communities attribute MAY append this attribute to the route when + propagating it to its peers. + + A BGP speaker receiving a route with the Extended Communities + attribute MAY modify this attribute according to the local policy. + + By default if a range of routes is to be aggregated and the resultant + aggregates path attributes do not carry the ATOMIC_AGGREGATE + attribute, then the resulting aggregate should have an Extended + Communities path attribute that contains the set union of all the + Extended Communities from all of the aggregated routes. The default + behavior could be overridden via local configuration, in which case + handling the Extended Communities attribute in the presence of route + aggregation becomes a matter of the local policy of the BGP speaker + that performs the aggregation. + + If a route has a non-transitivity extended community, then before + advertising the route across the Autonomous System boundary the + community SHOULD be removed from the route. 
However, the community + SHOULD NOT be removed when advertising the route across the BGP + Confederation boundary. + + A route may carry both the BGP Communities attribute, as defined in + [RFC1997], and the Extended BGP Communities attribute. In this + case, the BGP Communities attribute is handled as specified in + [RFC1997], and the Extended BGP Communities attribute is handled as + specified in this document. + +7. IANA Considerations + + All the BGP Extended Communities contain a Type field. The IANA has + created a registry entitled, "BGP Extended Communities Type". The + IANA will maintain this registry. + + The Type could be either regular or extended. For a regular Type the + IANA allocates an 8-bit value; for an extended Type the IANA + allocates a 16-bit value. + + + + +Sangli, et al. Standards Track [Page 7] + +RFC 4360 BGP Extended Communities Attribute February 2006 + + + The value allocated for a regular Type MUST NOT be reused as the + value of the high-order octet when allocating an extended Type. The + value of the high-order octet allocated for an extended Type MUST NOT + be reused when allocating a regular Type. + + The Type field indicates whether the Extended Community is transitive + or not. Future requests for assignment of a Type value must specify + whether the Type value is intended for a transitive or a non- + transitive Extended Community. + + Future assignments are to be made using either the Standards Action + process defined in [RFC2434], the Early IANA Allocation process + defined in [RFC4020], or the "First Come First Served" policy defined + in [RFC2434]. 
+ + The following table summarizes the ranges for the assignment of + Types: + + Type Standard Action First Come + Early IANA Allocation First Served + ------------------ --------------------- ------------ + + regular, transitive 0x90-0xbf 0x00-0x3f + + regular, non-transitive 0xd0-0xff 0x40-0x7f + + extended, transitive 0x9000-0xbfff 0x0000-0x3fff + + extended, non-transitive 0xd000-0xffff 0x4000-0x7fff + + Assignments consist of a name and the value. + + The Type values 0x80-0x8f and 0xc0-0xcf for regular Types, and + 0x8000-0x8fff and 0xc000-0xcfff for extended Types are for + Experimental use as defined in RFC 3692. + + This document defines a class of extended communities called two- + octet AS specific extended community for which the IANA is to create + and maintain a registry entitled "Two-octet AS Specific Extended + Community". All the communities in this class are of extended Types. + Future assignments are to be made using the "First Come First Served" + policy defined in [RFC2434]. The Type values for the transitive + communities of the two-octet AS specific extended community class are + 0x0000-0x00ff, and for the non-transitive communities of that class + are 0x4000-0x40ff. Assignments consist of a name and the value. + + This document makes the following assignments for the two-octet AS + specific extended community: + + + +Sangli, et al. Standards Track [Page 8] + +RFC 4360 BGP Extended Communities Attribute February 2006 + + + Name Type Value + ---- ---------- + two-octet AS specific Route Target 0x0002 + two-octet AS specific Route Origin 0x0003 + + This document defines a class of extended communities called IPv4 + address specific extended community for which the IANA is to create + and maintain a registry entitled "IPv4 Address Specific Extended + Community". All the communities in this class are of extended Types. + Future assignments are to be made using the "First Come First Served" + policy defined in [RFC2434].
The Type values for the transitive + communities of the IPv4 address specific extended community class + are 0x0100-0x01ff, and for the non-transitive communities of that + class are 0x4100-0x41ff. Assignments consist of a name and the + value. + + This document makes the following assignments for the IPv4 address + specific extended community: + + Name Type Value + ---- ---------- + IPv4 address specific Route Target 0x0102 + IPv4 address specific Route Origin 0x0103 + + This document defines a class of extended communities called opaque + extended community for which the IANA is to create and maintain a + registry entitled "Opaque Extended Community". All the communities + in this class are of extended Types. Future assignments are to be + made using the "First Come First Served" policy defined in [RFC2434]. + The Type values for the transitive communities of the opaque extended + community class are 0x0300-0x03ff, and for the non-transitive + communities of that class are 0x4300-0x43ff. Assignments consist of + a name and the value. + + When requesting an allocation from more than one registry defined + above, one may ask for allocating the same Type value from these + registries. If possible, the IANA should accommodate such requests. + +8. Security Considerations + + This extension to BGP has similar security implications as BGP + Communities [RFC1997]. + + This extension to BGP does not change the underlying security issues. + Specifically, an operator who is relying on the information carried + in BGP must have a transitive trust relationship back to the source + of the information. Specifying the mechanism(s) to provide such a + relationship is beyond the scope of this document. + + + +Sangli, et al. Standards Track [Page 9] + +RFC 4360 BGP Extended Communities Attribute February 2006 + + +9. Acknowledgements + + The authors would like to thank John Hawkinson, Jeffrey Haas, Bruno + Rijsman, Bill Fenner, and Alex Zinin for their suggestions and + feedback.
+ +10. Normative References + + [BGP-4] Rekhter, Y. and T. Li, "A Border Gateway Protocol 4 + (BGP-4)", RFC 4271, January 2006. + + [RFC1997] Chandra, R., Traina, P., and T. Li, "BGP Communities + Attribute", RFC 1997, August 1996. + + [RFC2119] Bradner, S., "Key words for use in RFCs to Indicate + Requirement Levels", BCP 14, RFC 2119, March 1997. + + [RFC2434] Narten, T. and H. Alvestrand, "Guidelines for Writing + an IANA Considerations Section in RFCs", BCP 26, RFC + 2434, October 1998. + + [RFC4020] Kompella, K. and A. Zinin, "Early IANA Allocation of + Standards Track Code Points", BCP 100, RFC 4020, + February 2005. + +11. Informative References + + [RFC4364] Rosen, E. and Y. Rekhter, "BGP/MPLS IP Virtual Private + Networks (VPNs)", RFC 4364, February 2006. + + + + + + + + + + + + + + + + + + + + + + +Sangli, et al. Standards Track [Page 10] + +RFC 4360 BGP Extended Communities Attribute February 2006 + + +Authors' Addresses + + Srihari R. Sangli + Cisco Systems, Inc. + + EMail: rsrihari@cisco.com + + + Dan Tappan + Cisco Systems, Inc. + 250 Apollo Drive + Chelmsford, MA 01824 + + EMail: tappan@cisco.com + + + Yakov Rekhter + Juniper Networks, Inc. + 1194 N. Mathilda Ave + Sunnyvale, CA 94089 + + EMail: yakov@juniper.net + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +Sangli, et al. Standards Track [Page 11] + +RFC 4360 BGP Extended Communities Attribute February 2006 + + +Full Copyright Statement + + Copyright (C) The Internet Society (2006). + + This document is subject to the rights, licenses and restrictions + contained in BCP 78, and except as set forth therein, the authors + retain all their rights. 
+ + This document and the information contained herein are provided on an + "AS IS" basis and THE CONTRIBUTOR, THE ORGANIZATION HE/SHE REPRESENTS + OR IS SPONSORED BY (IF ANY), THE INTERNET SOCIETY AND THE INTERNET + ENGINEERING TASK FORCE DISCLAIM ALL WARRANTIES, EXPRESS OR IMPLIED, + INCLUDING BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE + INFORMATION HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED + WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. + +Intellectual Property + + The IETF takes no position regarding the validity or scope of any + Intellectual Property Rights or other rights that might be claimed to + pertain to the implementation or use of the technology described in + this document or the extent to which any license under such rights + might or might not be available; nor does it represent that it has + made any independent effort to identify any such rights. Information + on the procedures with respect to rights in RFC documents can be + found in BCP 78 and BCP 79. + + Copies of IPR disclosures made to the IETF Secretariat and any + assurances of licenses to be made available, or the result of an + attempt made to obtain a general license or permission for the use of + such proprietary rights by implementers or users of this + specification can be obtained from the IETF on-line IPR repository at + http://www.ietf.org/ipr. + + The IETF invites any interested party to bring to its attention any + copyrights, patents or patent applications, or other proprietary + rights that may cover technology that may be required to implement + this standard. Please address the information to the IETF at + ietf-ipr@ietf.org. + +Acknowledgement + + Funding for the RFC Editor function is provided by the IETF + Administrative Support Activity (IASA). + + + + + + + +Sangli, et al. Standards Track [Page 12] + diff --git a/doc/rfc4364.txt b/doc/rfc4364.txt @@ -0,0 +1,2635 @@ + + + + + + +Network Working Group E. 
Rosen +Request for Comments: 4364 Cisco Systems, Inc. +Obsoletes: 2547 Y. Rekhter +Category: Standards Track Juniper Networks, Inc. + February 2006 + + + BGP/MPLS IP Virtual Private Networks (VPNs) + +Status of This Memo + + This document specifies an Internet standards track protocol for the + Internet community, and requests discussion and suggestions for + improvements. Please refer to the current edition of the "Internet + Official Protocol Standards" (STD 1) for the standardization state + and status of this protocol. Distribution of this memo is unlimited. + +Copyright Notice + + Copyright (C) The Internet Society (2006). + +Abstract + + This document describes a method by which a Service Provider may use + an IP backbone to provide IP Virtual Private Networks (VPNs) for its + customers. This method uses a "peer model", in which the customers' + edge routers (CE routers) send their routes to the Service Provider's + edge routers (PE routers); there is no "overlay" visible to the + customer's routing algorithm, and CE routers at different sites do + not peer with each other. Data packets are tunneled through the + backbone, so that the core routers do not need to know the VPN + routes. + + This document obsoletes RFC 2547. + + + + + + + + + + + + + + + + + +Rosen & Rekhter Standards Track [Page 1] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + +Table of Contents + + 1. Introduction ....................................................3 + 1.1. Virtual Private Networks ...................................4 + 1.2. Customer Edge and Provider Edge ............................5 + 1.3. VPNs with Overlapping Address Spaces .......................6 + 1.4. VPNs with Different Routes to the Same System ..............7 + 1.5. SP Backbone Routers ........................................7 + 1.6. Security ...................................................8 + 2. Sites and CEs ...................................................8 + 3. 
VRFs: Multiple Forwarding Tables in PEs .........................9 + 3.1. VRFs and Attachment Circuits ...............................9 + 3.2. Associating IP Packets with VRFs ..........................10 + 3.3. Populating the VRFs .......................................11 + 4. VPN Route Distribution via BGP .................................12 + 4.1. The VPN-IPv4 Address Family ...............................13 + 4.2. Encoding of Route Distinguishers ..........................14 + 4.3. Controlling Route Distribution ............................15 + 4.3.1. The Route Target Attribute .........................15 + 4.3.2. Route Distribution Among PEs by BGP ................17 + 4.3.3. Use of Route Reflectors ............................20 + 4.3.4. How VPN-IPv4 NLRI Is Carried in BGP ................22 + 4.3.5. Building VPNs Using Route Targets ..................23 + 4.3.6. Route Distribution Among VRFs in a Single PE .......23 + 5. Forwarding .....................................................23 + 6. Maintaining Proper Isolation of VPNs ...........................26 + 7. How PEs Learn Routes from CEs ..................................27 + 8. How CEs Learn Routes from PEs ..................................30 + 9. Carriers' Carriers .............................................30 + 10. Multi-AS Backbones ............................................32 + 11. Accessing the Internet from a VPN .............................34 + 12. Management VPNs ...............................................36 + 13. Security Considerations .......................................37 + 13.1. Data Plane ...............................................37 + 13.2. Control Plane ............................................39 + 13.3. Security of P and PE Devices .............................39 + 14. Quality of Service ............................................39 + 15. Scalability ...................................................40 + 16. 
IANA Considerations ...........................................40 + 17. Acknowledgements ..............................................41 + 18. Contributors ..................................................41 + 19. Normative References ..........................................44 + 20. Informative References ........................................45 + + + + + + + + +Rosen & Rekhter Standards Track [Page 2] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + +1. Introduction + + This document describes a method by which a Service Provider may use + an IP backbone to provide IP Virtual Private Networks (VPNs) for its + customers. This method uses a "peer model", in which the customers' + edge routers (CE routers) send their routes to the Service Provider's + edge routers (PE routers). Border Gateway Protocol (BGP) + [BGP, BGP-MP] is then used by the Service Provider to exchange the + routes of a particular VPN among the PE routers that are attached to + that VPN. This is done in a way that ensures that routes from + different VPNs remain distinct and separate, even if two VPNs have an + overlapping address space. The PE routers distribute, to the CE + routers in a particular VPN, the routes from the other CE routers in + that VPN. The CE routers do not peer with each other, hence there is + no "overlay" visible to the VPN's routing algorithm. The term "IP" + in "IP VPN" is used to indicate that the PE receives IP datagrams + from the CE, examines their IP headers, and routes them accordingly. + + Each route within a VPN is assigned a Multiprotocol Label Switching + (MPLS) [MPLS-ARCH, MPLS-BGP, MPLS-ENCAPS] label; when BGP distributes + a VPN route, it also distributes an MPLS label for that route. + Before a customer data packet travels across the Service Provider's + backbone, it is encapsulated with the MPLS label that corresponds, in + the customer's VPN, to the route that is the best match to the + packet's destination address.
This MPLS packet is further + encapsulated (e.g., with another MPLS label or with an IP or Generic + Routing Encapsulation (GRE) tunnel header [MPLS-in-IP-GRE]) so that + it gets tunneled across the backbone to the proper PE router. Thus, + the backbone core routers do not need to know the VPN routes. + + The primary goal of this method is to support the case in which a + client obtains IP backbone services from a Service Provider or + Service Providers with which it maintains contractual relationships. + The client may be an enterprise, a group of enterprises that need an + extranet, an Internet Service Provider, an application service + provider, another VPN Service Provider that uses this same method to + offer VPNs to clients of its own, etc. The method makes it very + simple for the client to use the backbone services. It is also very + scalable and flexible for the Service Provider, and allows the + Service Provider to add value. + + + + + + + + + + + +Rosen & Rekhter Standards Track [Page 3] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + +1.1. Virtual Private Networks + + Consider a set of "sites" that are attached to a common network that + we call "the backbone". Now apply some policy to create a number of + subsets of that set, and impose the following rule: two sites may + have IP interconnectivity over that backbone only if at least one of + these subsets contains them both. + + These subsets are Virtual Private Networks (VPNs). Two sites have IP + connectivity over the common backbone only if there is some VPN that + contains them both. Two sites that have no VPN in common have no + connectivity over that backbone. + + If all the sites in a VPN are owned by the same enterprise, the VPN + may be thought of as a corporate "intranet". If the various sites in + a VPN are owned by different enterprises, the VPN may be thought of + as an "extranet". A site can be in more than one VPN; e.g., in an + intranet and in several extranets. 
In general, when we use the term + "VPN" we will not be distinguishing between intranets and extranets. + + We refer to the owners of the sites as the "customers". We refer to + the owners/operators of the backbone as the "Service Providers" + (SPs). The customers obtain "VPN service" from the SPs. + + A customer may be a single enterprise, a set of enterprises, an + Internet Service Provider, an Application Service Provider, another + SP that offers the same kind of VPN service to its own customers, + etc. + + The policies that determine whether a particular collection of sites + is a VPN are the policies of the customers. Some customers will want + the implementation of these policies to be entirely the + responsibility of the SP. Other customers may want to share with the + SP the responsibility for implementing these policies. This document + specifies mechanisms that can be used to implement these policies. + The mechanisms we describe are general enough to allow these policies + to be implemented either by the SP alone or by a VPN customer + together with the SP. Most of the discussion is focused on the + former case, however. + + The mechanisms discussed in this document allow the implementation of + a wide range of policies. For example, within a given VPN, one can + allow every site to have a direct route to every other site ("full + mesh"). Alternatively, one can force traffic between certain pairs + of sites to be routed via a third site. This can be useful, e.g., if + it is desired that traffic between a pair of sites be passed through + a firewall, and the firewall is located at the third site. + + + + +Rosen & Rekhter Standards Track [Page 4] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + In this document, we restrict our discussion to the case in which the + customer is explicitly purchasing VPN service from an SP, or from a + set of SPs that have agreed to cooperate to provide the VPN service. 
+ That is, the customer is not merely purchasing internet access from + an SP, and the VPN traffic does not pass through a random collection + of interconnected SP networks. + + We also restrict our discussion to the case in which the backbone + provides an IP service to the customer, rather than, e.g., a layer 2 + service such as Frame Relay, Asynchronous Transfer Mode (ATM), + ethernet, High Level Data Link Control (HDLC), or Point-to-Point + Protocol (PPP). The customer may attach to the backbone via one of + these (or other) layer 2 services, but the layer 2 service is + terminated at the "edge" of the backbone, where the customer's IP + datagrams are removed from any layer 2 encapsulation. + + In the rest of this introduction, we specify some properties that + VPNs should have. The remainder of this document specifies a set of + mechanisms that can be deployed to provide a VPN model that has all + these properties. This section also introduces some of the technical + terminology used in the remainder of the document. + +1.2. Customer Edge and Provider Edge + + Routers can be attached to each other, or to end systems, in a + variety of different ways: PPP connections, ATM Virtual Circuits + (VCs), Frame Relay VCs, ethernet interfaces, Virtual Local Area + Networks (VLANs) on ethernet interfaces, GRE tunnels, Layer 2 + Tunneling Protocol (L2TP) tunnels, IPsec tunnels, etc. We will use + the term "attachment circuit" to refer generally to some such means + of attaching to a router. An attachment circuit may be the sort of + connection that is usually thought of as a "data link", or it may be + a tunnel of some sort; what matters is that it be possible for two + devices to be network layer peers over the attachment circuit. + + Each VPN site must contain one or more Customer Edge (CE) devices. + Each CE device is attached, via some sort of attachment circuit, to + one or more Provider Edge (PE) routers. 
+ + Routers in the SP's network that do not attach to CE devices are + known as "P routers". + + CE devices can be hosts or routers. In a typical case, a site + contains one or more routers, some of which are attached to PE + routers. The site routers that attach to the PE routers would then + be the CE devices, or "CE routers". However, there is nothing to + prevent a non-routing host from attaching directly to a PE router, in + which case the host would be a CE device. + + + +Rosen & Rekhter Standards Track [Page 5] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + Sometimes, what is physically attached to a PE router is a layer 2 + switch. In this case, we do NOT say that the layer 2 switch is a CE + device. Rather, the CE devices are the hosts and routers that + communicate with the PE router through the layer 2 switch; the layer + 2 infrastructure is transparent. If the layer 2 infrastructure + provides a multipoint service, then multiple CE devices can be + attached to the PE router over the same attachment circuit. + + CE devices are logically part of a customer's VPN. PE and P routers + are logically part of the SP's network. + + The attachment circuit over which a packet travels when going from CE + to PE is known as that packet's "ingress attachment circuit", and the + PE as the packet's "ingress PE". The attachment circuit over which a + packet travels when going from PE to CE is known as that packet's + "egress attachment circuit", and the PE as the packet's "egress PE". + + We will say that a PE router is attached to a particular VPN if it is + attached to a CE device that is in a site of that VPN. Similarly, we + will say that a PE router is attached to a particular site if it is + attached to a CE device that is in that site. + + When the CE device is a router, it is a routing peer of the PE(s) to + which it is attached, but it is NOT a routing peer of CE routers at + other sites. 
Routers at different sites do not directly exchange + routing information with each other; in fact, they do not even need + to know of each other at all. As a consequence, the customer has no + backbone or "virtual backbone" to manage, and does not have to deal + with any inter-site routing issues. In other words, in the scheme + described in this document, a VPN is NOT an "overlay" on top of the + SP's network. + + With respect to the management of the edge devices, clear + administrative boundaries are maintained between the SP and its + customers. Customers are not required to access the PE or P routers + for management purposes, nor is the SP required to access the CE + devices for management purposes. + +1.3. VPNs with Overlapping Address Spaces + + If two VPNs have no sites in common, then they may have overlapping + address spaces. That is, a given address might be used in VPN V1 as + the address of system S1, but in VPN V2 as the address of a + completely different system S2. This is a common situation when the + VPNs each use an RFC 1918 private address space. Of course, within + each VPN, each address must be unambiguous. + + + + + +Rosen & Rekhter Standards Track [Page 6] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + Even two VPNs that do have sites in common may have overlapping + address spaces, as long as there is no need for any communication + between systems with such addresses and systems in the common sites. + +1.4. VPNs with Different Routes to the Same System + + Although a site may be in multiple VPNs, it is not necessarily the + case that the route to a given system at that site should be the same + in all the VPNs. Suppose, for example, we have an intranet + consisting of sites A, B, and C, and an extranet consisting of A, B, + C, and the "foreign" site D. Suppose that at site A there is a + server, and we want clients from B, C, or D to be able to use that + server. Suppose also that at site B there is a firewall. 
We want + all the traffic from site D to the server to pass through the + firewall, so that traffic from the extranet can be access controlled. + However, we don't want traffic from C to pass through the firewall on + the way to the server, since this is intranet traffic. + + It is possible to set up two routes to the server. One route, used + by sites B and C, takes the traffic directly to site A. The second + route, used by site D, takes the traffic instead to the firewall at + site B. If the firewall allows the traffic to pass, it then appears + to be traffic coming from site B, and follows the route to site A. + +1.5. SP Backbone Routers + + The SP's backbone consists of the PE routers, as well as other + routers ("P routers") that do not attach to CE devices. + + If every router in an SP's backbone had to maintain routing + information for all the VPNs supported by the SP, there would be + severe scalability problems; the number of sites that could be + supported would be limited by the amount of routing information that + could be held in a single router. It is important therefore that the + routing information about a particular VPN only needs to be present + in the PE routers that attach to that VPN. In particular, the P + routers do not need to have ANY per-VPN routing information + whatsoever. (This condition may need to be relaxed somewhat when + multicast routing is considered. This is not considered further in + this paper, but is examined in [VPN-MCAST].) + + So just as the VPN owners do not have a backbone or "virtual + backbone" to administer, the SPs themselves do not have a separate + backbone or "virtual backbone" to administer for each VPN. Site-to- + site routing in the backbone is optimal (within the constraints of + the policies used to form the VPNs) and is not constrained in any way + by an artificial "virtual topology" of tunnels. 
+ + + + +Rosen & Rekhter Standards Track [Page 7] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + Section 10 discusses some of the special issues that arise when the + backbone spans several Service Providers. + +1.6. Security + + VPNs of the sort being discussed here, even without making use of + cryptographic security measures, are intended to provide a level of + security equivalent to that obtainable when a layer 2 backbone (e.g., + Frame Relay) is used. That is, in the absence of misconfiguration or + deliberate interconnection of different VPNs, it is not possible for + systems in one VPN to gain access to systems in another VPN. Of + course, the methods described herein do not by themselves encrypt the + data for privacy, nor do they provide a way to determine whether data + has been tampered with en route. If this is desired, cryptographic + measures must be applied in addition. (See, e.g., [MPLS/BGP-IPsec].) + Security is discussed in more detail in Section 13. + +2. Sites and CEs + + From the perspective of a particular backbone network, a set of IP + systems may be regarded as a "site" if those systems have mutual IP + interconnectivity that doesn't require use of the backbone. In + general, a site will consist of a set of systems that are in + geographic proximity. However, this is not universally true. If two + geographic locations are connected via a leased line, over which Open + Shortest Path First (OSPF) protocol [OSPFv2] is running, and if that + line is the preferred way of communicating between the two locations, + then the two locations can be regarded as a single site, even if each + location has its own CE router. (This notion of "site" is + topological, rather than geographical. If the leased line goes down, + or otherwise ceases to be the preferred route, but the two geographic + locations can continue to communicate by using the VPN backbone, then + one site has become two.) 
+ + A CE device is always regarded as being in a single site (though as + we shall see in Section 3.2, a site may consist of multiple "virtual + sites"). A site, however, may belong to multiple VPNs. + + A PE router may attach to CE devices from any number of different + sites, whether those CE devices are in the same or in different VPNs. + A CE device may, for robustness, attach to multiple PE routers, of + the same or of different service providers. If the CE device is a + router, the PE router and the CE router will appear as router + adjacencies to each other. + + While we speak mostly of "sites" as being the basic unit of + interconnection, nothing here prevents a finer degree of granularity + in the control of interconnectivity. For example, certain systems at + + + +Rosen & Rekhter Standards Track [Page 8] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + a site may be members of an intranet as well as members of one or + more extranets, while other systems at the same site may be + restricted to being members of the intranet only. However, this + might require that the site have two attachment circuits to the + backbone, one for the intranet and one for the extranet; it might + further require that firewall functionality be applied on the + extranet attachment circuit. + +3. VRFs: Multiple Forwarding Tables in PEs + + Each PE router maintains a number of separate forwarding tables. One + of the forwarding tables is the "default forwarding table". The + others are "VPN Routing and Forwarding tables", or "VRFs". + +3.1. VRFs and Attachment Circuits + + Every PE/CE attachment circuit is associated, by configuration, with + one or more VRFs. An attachment circuit that is associated with a + VRF is known as a "VRF attachment circuit". + + In the simplest case and most typical case, a PE/CE attachment + circuit is associated with exactly one VRF. 
When an IP packet is + received over a particular attachment circuit, its destination IP + address is looked up in the associated VRF. The result of that + lookup determines how to route the packet. The VRF used by a + packet's ingress PE for routing a particular packet is known as the + packet's "ingress VRF". (There is also the notion of a packet's + "egress VRF", located at the packet's egress PE; this is discussed in + Section 5.) + + If an IP packet arrives over an attachment circuit that is not + associated with any VRF, the packet's destination address is looked + up in the default forwarding table, and the packet is routed + accordingly. Packets forwarded according to the default forwarding + table include packets from neighboring P or PE routers, as well as + packets from customer-facing attachment circuits that have not been + associated with VRFs. + + Intuitively, one can think of the default forwarding table as + containing "public routes", and of the VRFs as containing "private + routes". One can similarly think of VRF attachment circuits as being + "private", and of non-VRF attachment circuits as being "public". + + If a particular VRF attachment circuit connects site S to a PE + router, then connectivity from S (via that attachment circuit) can be + restricted by controlling the set of routes that gets entered in the + corresponding VRF. The set of routes in that VRF should be limited + to the set of routes leading to sites that have at least one VPN in + + + +Rosen & Rekhter Standards Track [Page 9] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + common with S. Then a packet sent from S over a VRF attachment + circuit can only be routed by the PE to another site S' if S' is in + one of the same VPNs as S. That is, communication (via PE routers) + is prevented between any pair of VPN sites that have no VPN in + common. 
Communication between VPN sites and non-VPN sites is + prevented by keeping the routes to the VPN sites out of the default + forwarding table. + + If there are multiple attachment circuits leading from S to one or + more PE routers, then there might be multiple VRFs that could be used + to route traffic from S. To properly restrict S's connectivity, the + same set of routes would have to exist in all the VRFs. + Alternatively, one could impose different connectivity restrictions + over different attachment circuits from S. In that case, some of the + VRFs associated with attachment circuits from S would contain + different sets of routes than some of the others. + + We allow the case in which a single attachment circuit is associated + with a set of VRFs, rather than with a single VRF. This can be + useful if it is desired to divide a single VPN into several + "sub-VPNs", each with different connectivity restrictions, where some + characteristic of the customer packets is used to select from among + the sub-VPNs. For simplicity though, we will usually speak of an + attachment circuit as being associated with a single VRF. + +3.2. Associating IP Packets with VRFs + + When a PE router receives a packet from a CE device, it must + determine the attachment circuit over which the packet arrived, as + this determines in turn the VRF (or set of VRFs) that can be used for + forwarding that packet. In general, to determine the attachment + circuit over which a packet arrived, a PE router takes note of the + physical interface over which the packet arrived, and possibly also + takes note of some aspect of the packet's layer 2 header. For + example, if a packet's ingress attachment circuit is a Frame Relay + VC, the identity of the attachment circuit can be determined from the + physical Frame Relay interface over which the packet arrived, + together with the Data Link Connection Identifier (DLCI) field in the + packet's Frame Relay header.
+ + Although the PE's conclusion that a particular packet arrived on a + particular attachment circuit may be partially determined by the + packet's layer 2 header, it must be impossible for a customer, by + writing the header fields, to fool the SP into thinking that a packet + that was received over one attachment circuit really arrived over a + different one. In the example above, although the attachment circuit + is determined partially by inspection of the DLCI field in the Frame + Relay header, this field cannot be set freely by the customer. + + + +Rosen & Rekhter Standards Track [Page 10] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + Rather, it must be set to a value specified by the SP, or else the + packet cannot arrive at the PE router. + + In some cases, a particular site may be divided by the customer into + several "virtual sites". The SP may designate a particular set of + VRFs to be used for routing packets from that site and may allow the + customer to set some characteristic of the packet, which is then used + for choosing a particular VRF from the set. + + For example, each virtual site might be realized as a VLAN. The SP + and the customer could agree that on packets arriving from a + particular CE, certain VLAN values would be used to identify certain + VRFs. Of course, packets from that CE would be discarded by the PE + if they carry VLAN tag values that are not in the agreed-upon set. + Another way to accomplish this is to use IP source addresses. In + this case, the PE uses the IP source address in a packet received + from the CE, along with the interface over which the packet is + received, to assign the packet to a particular VRF. Again, the + customer would only be able to select from among the particular set + of VRFs that that customer is allowed to use. + + If it is desired to have a particular host be in multiple virtual + sites, then that host must determine, for each packet, which virtual + site the packet is associated with. 
It can do this, e.g., by sending + packets from different virtual sites on different VLANs, or out + different network interfaces. + +3.3. Populating the VRFs + + With what set of routes are the VRFs populated? + + As an example, let PE1, PE2, and PE3 be three PE routers, and let + CE1, CE2, and CE3 be three CE routers. Suppose that PE1 learns, from + CE1, the routes that are reachable at CE1's site. If PE2 and PE3 are + attached, respectively, to CE2 and CE3, and there is some VPN V + containing CE1, CE2, and CE3, then PE1 uses BGP to distribute to PE2 + and PE3 the routes that it has learned from CE1. PE2 and PE3 use + these routes to populate the VRFs that they associate, respectively, + with the sites of CE2 and CE3. Routes from sites that are not in VPN + V do not appear in these VRFs, which means that packets from CE2 or + CE3 cannot be sent to sites that are not in VPN V. + + When we speak of a PE "learning" routes from a CE, we are not + presupposing any particular learning technique. The PE may learn + routes by means of a dynamic routing algorithm, but it may also + "learn" routes by having those routes configured (i.e., static + routing). (In this case, to say that the PE "learned" the routes + from the CE is perhaps to exercise a bit of poetic license.) + + + +Rosen & Rekhter Standards Track [Page 11] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + PEs also need to learn, from other PEs, the routes that belong to a + given VPN. The procedures to be used for populating the VRFs with + the proper sets of routes are specified in Section 4. + + If there are multiple attachment circuits leading from a particular + PE router to a particular site, they might all be mapped to the same + forwarding table. But if policy dictates, they could be mapped to + different forwarding tables. 
For instance, the policy might be that + a particular attachment circuit from a site is used only for intranet + traffic, while another attachment circuit from that site is used only + for extranet traffic. (Perhaps, e.g., the CE attached to the + extranet attachment circuit is a firewall, while the CE attached to + the intranet attachment circuit is not.) In this case, the two + attachment circuits would be associated with different VRFs. + + Note that if two attachment circuits are associated with the same + VRF, then packets that the PE receives over one of them will be able + to reach exactly the same set of destinations as packets that the PE + receives over the other. So two attachment circuits cannot be + associated with the same VRF unless each CE is in the exact same set + of VPNs as is the other. + + If an attachment circuit leads to a site which is in multiple VPNs, + the attachment circuit may still be associated with a single VRF, in + which case the VRF will contain routes from the full set of VPNs of + which the site is a member. + +4. VPN Route Distribution via BGP + + PE routers use BGP to distribute VPN routes to each other (more + accurately, to cause VPN routes to be distributed to each other). + + We allow each VPN to have its own address space, which means that a + given address may denote different systems in different VPNs. If two + routes to the same IP address prefix are actually routes to different + systems, it is important to ensure that BGP not treat them as + comparable. Otherwise, BGP might choose to install only one of them, + making the other system unreachable. Further, we must ensure that + POLICY is used to determine which packets get sent on which routes; + given that several such routes are installed by BGP, only one such + must appear in any particular VRF. + + We meet these goals by the use of a new address family, as specified + below. 
+ + + + + + + +Rosen & Rekhter Standards Track [Page 12] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + +4.1. The VPN-IPv4 Address Family + + The BGP Multiprotocol Extensions [BGP-MP] allow BGP to carry routes + from multiple "address families". We introduce the notion of the + "VPN-IPv4 address family". A VPN-IPv4 address is a 12-byte quantity, + beginning with an 8-byte Route Distinguisher (RD) and ending with a + 4-byte IPv4 address. If several VPNs use the same IPv4 address + prefix, the PEs translate these into unique VPN-IPv4 address + prefixes. This ensures that if the same address is used in several + different VPNs, it is possible for BGP to carry several completely + different routes to that address, one for each VPN. + + Since VPN-IPv4 addresses and IPv4 addresses are different address + families, BGP never treats them as comparable addresses. + + An RD is simply a number, and it does not contain any inherent + information; it does not identify the origin of the route or the set + of VPNs to which the route is to be distributed. The purpose of the + RD is solely to allow one to create distinct routes to a common IPv4 + address prefix. Other means are used to determine where to + redistribute the route (see Section 4.3). + + The RD can also be used to create multiple different routes to the + very same system. We have already discussed a situation in which the + route to a particular server should be different for intranet traffic + than for extranet traffic. This can be achieved by creating two + different VPN-IPv4 routes that have the same IPv4 part, but different + RDs. This allows BGP to install multiple different routes to the + same system, and allows policy to be used (see Section 4.3.5) to + decide which packets use which route. 
+ + The RDs are structured so that every Service Provider can administer + its own "numbering space" (i.e., can make its own assignments of + RDs), without conflicting with the RD assignments made by any other + Service Provider. An RD consists of three fields: a 2-byte type + field, an administrator field, and an assigned number field. The + value of the type field determines the lengths of the other two + fields, as well as the semantics of the administrator field. The + administrator field identifies an assigned number authority, and the + assigned number field contains a number that has been assigned, by + the identified authority, for a particular purpose. For example, one + could have an RD whose administrator field contains an Autonomous + System number (ASN), and whose (4-byte) number field contains a + number assigned by the SP to whom that ASN belongs (having been + assigned to that SP by the appropriate authority). + + RDs are given this structure in order to ensure that an SP that + provides VPN backbone service can always create a unique RD when it + + + +Rosen & Rekhter Standards Track [Page 13] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + needs to do so. However, the structure is not meaningful to BGP; + when BGP compares two such address prefixes, it ignores the structure + entirely. + + A PE needs to be configured such that routes that lead to a + particular CE become associated with a particular RD. The + configuration may cause all routes leading to the same CE to be + associated with the same RD, or it may cause different routes to be + associated with different RDs, even if they lead to the same CE. + +4.2. Encoding of Route Distinguishers + + As stated, a VPN-IPv4 address consists of an 8-byte Route + Distinguisher followed by a 4-byte IPv4 address. The RDs are encoded + as follows: + + - Type Field: 2 bytes + - Value Field: 6 bytes + + The interpretation of the Value field depends on the value of the + type field. 
At the present time, three values of the type field are + defined: 0, 1, and 2. + + - Type 0: The Value field consists of two subfields: + + * Administrator subfield: 2 bytes + * Assigned Number subfield: 4 bytes + + The Administrator subfield must contain an Autonomous System + number. If this ASN is from the public ASN space, it must have + been assigned by the appropriate authority (use of ASN values + from the private ASN space is strongly discouraged). The + Assigned Number subfield contains a number from a numbering space + that is administered by the enterprise to which the ASN has been + assigned by an appropriate authority. + + - Type 1: The Value field consists of two subfields: + + * Administrator subfield: 4 bytes + * Assigned Number subfield: 2 bytes + + The Administrator subfield must contain an IP address. If this + IP address is from the public IP address space, it must have been + assigned by an appropriate authority (use of addresses from the + private IP address space is strongly discouraged). The Assigned + Number subfield contains a number from a numbering space which is + administered by the enterprise to which the IP address has been + assigned. + + + +Rosen & Rekhter Standards Track [Page 14] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + - Type 2: The Value field consists of two subfields: + + * Administrator subfield: 4 bytes + * Assigned Number subfield: 2 bytes + + The Administrator subfield must contain a 4-byte Autonomous + System number [BGP-AS4]. If this ASN is from the public ASN + space, it must have been assigned by the appropriate authority + (use of ASN values from the private ASN space is strongly + discouraged). The Assigned Number subfield contains a number + from a numbering space which is administered by the enterprise to + which the ASN has been assigned by an appropriate authority. + +4.3. 
Controlling Route Distribution + + In this section, we discuss the way in which the distribution of the + VPN-IPv4 routes is controlled. + + If a PE router is attached to a particular VPN (by being attached to + a particular CE in that VPN), it learns some of that VPN's IP routes + from the attached CE router. Routes learned from a CE routing peer + over a particular attachment circuit may be installed in the VRF + associated with that attachment circuit. Exactly which routes are + installed in this manner is determined by the way in which the PE + learns routes from the CE. In particular, when the PE and CE are + routing protocol peers, this is determined by the decision process of + the routing protocol; this is discussed in Section 7. + + These routes are then converted to VPN-IPv4 routes, and "exported" to + BGP. If there is more than one route to a particular VPN-IPv4 address + prefix, BGP chooses the "best" one, using the BGP decision process. + That route is then distributed by BGP to the set of other PEs that + need to know about it. At these other PEs, BGP will again choose the + best route for a particular VPN-IPv4 address prefix. Then the chosen + VPN-IPv4 routes are converted back into IP routes, and "imported" into + one or more VRFs. Whether they are actually installed in the VRFs + depends on the decision process of the routing method used between + the PE and those CEs that are associated with the VRF in question. + Finally, any route installed in a VRF may be distributed to the + associated CE routers. + +4.3.1. The Route Target Attribute + + Every VRF is associated with one or more Route Target (RT) + attributes. + + When a VPN-IPv4 route is created (from an IPv4 route that the PE has + learned from a CE) by a PE router, it is associated with one or more + + + +Rosen & Rekhter Standards Track [Page 15] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + Route Target attributes. These are carried in BGP as attributes of + the route. 
+ + Any route associated with Route Target T must be distributed to every + PE router that has a VRF associated with Route Target T. When such a + route is received by a PE router, it is eligible to be installed in + those of the PE's VRFs that are associated with Route Target T. + (Whether it actually gets installed depends upon the outcome of the + BGP decision process, and upon the outcome of the decision process of + the IGP (i.e., the intra-domain routing protocol) running on the + PE/CE interface.) + + A Route Target attribute can be thought of as identifying a set of + sites. (Though it would be more precise to think of it as + identifying a set of VRFs.) Associating a particular Route Target + attribute with a route allows that route to be placed in the VRFs + that are used for routing traffic that is received from the + corresponding sites. + + There is a set of Route Targets that a PE router attaches to a route + received from site S; these may be called the "Export Targets". And + there is a set of Route Targets that a PE router uses to determine + whether a route received from another PE router could be placed in + the VRF associated with site S; these may be called the "Import + Targets". The two sets are distinct, and need not be the same. Note + that a particular VPN-IPv4 route is only eligible for installation in + a particular VRF if there is some Route Target that is both one of + the route's Route Targets and one of the VRF's Import Targets. + + The function performed by the Route Target attribute is similar to + that performed by the BGP Communities attribute. However, the format + of the latter is inadequate for present purposes, since it allows + only a 2-byte numbering space. 
It is desirable to structure the + format, similar to what we have described for RDs (see Section 4.2), + so that a type field defines the length of an administrator field, + and the remainder of the attribute is a number from the specified + administrator's numbering space. This can be done using BGP Extended + Communities. The Route Targets discussed herein are encoded as BGP + Extended Community Route Targets [BGP-EXTCOMM]. They are structured + similarly to the RDs. + + When a BGP speaker has received more than one route to the same VPN- + IPv4 prefix, the BGP rules for route preference are used to choose + which VPN-IPv4 route is installed by BGP. + + Note that a route can only have one RD, but it can have multiple + Route Targets. In BGP, scalability is improved if one has a single + route with multiple attributes, as opposed to multiple routes. One + + + +Rosen & Rekhter Standards Track [Page 16] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + could eliminate the Route Target attribute by creating more routes + (i.e., using more RDs), but the scaling properties would be less + favorable. + + How does a PE determine which Route Target attributes to associate + with a given route? There are a number of different possible ways. + The PE might be configured to associate all routes that lead to a + specified site with a specified Route Target. Or the PE might be + configured to associate certain routes leading to a specified site + with one Route Target, and certain with another. + + If the PE and the CE are themselves BGP peers (see Section 7), then + the SP may allow the customer, within limits, to specify how its + routes are to be distributed. The SP and the customer would need to + agree in advance on the set of RTs that are allowed to be attached to + the customer's VPN routes. The CE could then attach one or more of + those RTs to each IP route that it distributes to the PE. 
This gives + the customer the freedom to specify in real time, within agreed-upon + limits, its route distribution policies. If the CE is allowed to + attach RTs to its routes, the PE MUST filter out all routes that + contain RTs that the customer is not allowed to use. If the CE is + not allowed to attach RTs to its routes, but does so anyway, the PE + MUST remove the RT before converting the customer's route to a VPN- + IPv4 route. + +4.3.2. Route Distribution Among PEs by BGP + + If two sites of a VPN attach to PEs that are in the same Autonomous + System, the PEs can distribute VPN-IPv4 routes to each other by means + of an IBGP connection between them. (The term "IBGP" refers to the + set of protocols and procedures used when there is a BGP connection + between two BGP speakers in the same Autonomous System. This is + distinguished from "EBGP", the set of procedures used between two BGP + speakers in different Autonomous Systems.) Alternatively, each can + have an IBGP connection to a route reflector [BGP-RR]. + + When a PE router distributes a VPN-IPv4 route via BGP, it uses its + own address as the "BGP next hop". This address is encoded as a + VPN-IPv4 address with an RD of 0. ([BGP-MP] requires that the next + hop address be in the same address family as the Network Layer + Reachability Information (NLRI).) It also assigns and distributes an + MPLS label. (Essentially, PE routers distribute not VPN-IPv4 routes, + but Labeled VPN-IPv4 routes. Cf. [MPLS-BGP].) When the PE processes + a received packet that has this label at the top of the stack, the PE + will pop the stack, and process the packet appropriately. + + + + + + +Rosen & Rekhter Standards Track [Page 17] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + The PE may distribute the exact set of routes that appears in the + VRF, or it may perform summarization and distribute aggregates of + those routes, or it may do some of one and some of the other. 
+ + Suppose that a PE has assigned label L to route R, and has + distributed this label mapping via BGP. If R is an aggregate of a + set of routes in the VRF, the PE will know that packets from the + backbone that arrive with this label must have their destination + addresses looked up in a VRF. When the PE looks up the label in its + Label Information Base, it learns which VRF must be used. On the + other hand, if R is not an aggregate, then when the PE looks up the + label, it learns the egress attachment circuit, as well as the + encapsulation header for the packet. In this case, no lookup in the + VRF is done. + + We would expect that the most common case would be the case where the + route is NOT an aggregate. The case where it is an aggregate can be + very useful though if the VRF contains a large number of host routes + (e.g., as in dial-in), or if the VRF has an associated Local Area + Network (LAN) interface (where there is a different outgoing layer 2 + header for each system on the LAN, but a route is not distributed for + each such system). + + Whether or not each route has a distinct label is an implementation + matter. There are a number of possible algorithms one could use to + determine whether two routes get assigned the same label: + + - One may choose to have a single label for an entire VRF, so that + a single label is shared by all the routes from that VRF. Then + when the egress PE receives a packet with that label, it must + look up the packet's IP destination address in that VRF (the + packet's "egress VRF"), in order to determine the packet's egress + attachment circuit and the corresponding data link encapsulation. + + - One may choose to have a single label for each attachment + circuit, so that a single label is shared by all the routes with + the same "outgoing attachment circuit". 
This enables one to + avoid doing a lookup in the egress VRF, though some sort of + lookup may need to be done in order to determine the data link + encapsulation, e.g., an Address Resolution Protocol (ARP) lookup. + + - One may choose to have a distinct label for each route. Then if + a route is potentially reachable over more than one attachment + circuit, the PE/CE routing can switch the preferred path for a + route from one attachment circuit to another, without there being + any need to distribute a new label for that route. + + + + + +Rosen & Rekhter Standards Track [Page 18] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + There may be other possible algorithms as well. The choice of + algorithm is entirely at the discretion of the egress PE, and is + otherwise transparent. + + In using BGP-distributed MPLS labels in this manner, we presuppose + that an MPLS packet carrying such a label can be tunneled from the + router that installs the corresponding BGP-distributed route to the + router that is the BGP next hop of that route. This requires either + that a label switched path exist between those two routers or else + that some other tunneling technology (e.g., [MPLS-in-IP-GRE]) can be + used between them. + + This tunnel may follow a "best effort" route, or it may follow a + traffic-engineered route. Between a given pair of routers, there may + be one such tunnel, or there may be several, perhaps with different + Quality of Service (QoS) characteristics. All that matters for the + VPN architecture is that some such tunnel exists. To ensure + interoperability among systems that implement this VPN architecture + using MPLS label switched paths as the tunneling technology, all such + systems MUST support Label Distribution Protocol (LDP) [MPLS-LDP]. 
+ In particular, Downstream Unsolicited mode MUST be supported on + interfaces that are neither Label Controlled ATM (LC-ATM) [MPLS-ATM] + nor Label Controlled Frame Relay (LC-FR) [MPLS-FR] interfaces, and + Downstream on Demand mode MUST be supported on LC-ATM interfaces and + LC-FR interfaces. + + If the tunnel follows a best-effort route, then the PE finds the + route to the remote endpoint by looking up its IP address in the + default forwarding table. + + A PE router, UNLESS it is a route reflector (see Section 4.3.3) or an + Autonomous System Border Router (ASBR) for an inter-provider VPN (see + Section 10), should not install a VPN-IPv4 route unless it has at + least one VRF with an Import Target identical to one of the route's + Route Target attributes. Inbound filtering should be used to cause + such routes to be discarded. If a new Import Target is later added + to one of the PE's VRFs (a "VPN Join" operation), it must then + acquire the routes it may previously have discarded. This can be + done using the refresh mechanism described in [BGP-RFSH]. The + outbound route filtering mechanism of [BGP-ORF] can also be used to + advantage to make the filtering more dynamic. + + Similarly, if a particular Import Target is no longer present in any + of a PE's VRFs (as a result of one or more "VPN Prune" operations), + the PE may discard all routes that, as a result, no longer have any + of the PE's VRF's Import Targets as one of their Route Target + attributes. + + + + +Rosen & Rekhter Standards Track [Page 19] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + A router that is not attached to any VPN and that is not a Route + Reflector (i.e., a P router) never installs any VPN-IPv4 routes at + all. + + Note that VPN Join and Prune operations are non-disruptive and do not + require any BGP connections to be brought down, as long as the + refresh mechanism of [BGP-RFSH] is used. 
+ + As a result of these distribution rules, no one PE ever needs to + maintain all routes for all VPNs; this is an important scalability + consideration. + +4.3.3. Use of Route Reflectors + + Rather than having a complete IBGP mesh among the PEs, it is + advantageous to make use of BGP Route Reflectors [BGP-RR] to improve + scalability. All the usual techniques for using route reflectors to + improve scalability (e.g., route reflector hierarchies) are + available. + + Route reflectors are the only systems that need to have routing + information for VPNs to which they are not directly attached. + However, there is no need to have any one route reflector know all + the VPN-IPv4 routes for all the VPNs supported by the backbone. + + We outline below two different ways to partition the set of VPN-IPv4 + routes among a set of route reflectors. + + 1. Each route reflector is preconfigured with a list of Route + Targets. For redundancy, more than one route reflector may be + preconfigured with the same list. A route reflector uses the + preconfigured list of Route Targets to construct its inbound + route filtering. The route reflector may use the techniques of + [BGP-ORF] to install on each of its peers (regardless of + whether the peer is another route reflector or a PE) the set of + Outbound Route Filters (ORFs) that contains the list of its + preconfigured Route Targets. Note that route reflectors should + accept ORFs from other route reflectors, which means that route + reflectors should advertise the ORF capability to other route + reflectors. + + A service provider may modify the list of preconfigured Route + Targets on a route reflector. When this is done, the route + reflector modifies the ORFs it installs on all of its IBGP + peers. To reduce the frequency of configuration changes on + route reflectors, each route reflector may be preconfigured + with a block of Route Targets. 
This way, when a new Route + Target is needed for a new VPN, there is already one or more + + + +Rosen & Rekhter Standards Track [Page 20] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + route reflectors that are (pre)configured with this Route + Target. + + Unless a given PE is a client of all route reflectors, when a + new VPN is added to the PE ("VPN Join"), it will need to become + a client of the route reflector(s) that maintain routes for + that VPN. Likewise, deleting an existing VPN from the PE ("VPN + Prune") may result in a situation where the PE no longer needs + to be a client of some route reflector(s). In either case, the + Join or Prune operation is non-disruptive (as long as + [BGP-RFSH] is used), and never requires a BGP connection to be + brought down, only to be brought right back up. + + (By "adding a new VPN to a PE", we really mean adding a new + import Route Target to one of its VRFs, or adding a new VRF + with an import Route Target not had by any of the PE's other + VRFs.) + + 2. Another method is to have each PE be a client of some subset of + the route reflectors. A route reflector is not preconfigured + with the list of Route Targets, and does not perform inbound + route filtering of routes received from its clients (PEs); + rather, it accepts all the routes received from all of its + clients (PEs). The route reflector keeps track of the set of + the Route Targets carried by all the routes it receives. When + the route reflector receives from its client a route with a + Route Target that is not in this set, this Route Target is + immediately added to the set. On the other hand, when the + route reflector no longer has any routes with a particular + Route Target that is in the set, the route reflector should + delay (by a few hours) the deletion of this Route Target from + the set. + + The route reflector uses this set to form the inbound route + filters that it applies to routes received from other route + reflectors. 
The route reflector may also use ORFs to install + the appropriate outbound route filtering on other route + reflectors. Just like with the first approach, a route + reflector should accept ORFs from other route reflectors. To + accomplish this, a route reflector advertises ORF capability to + other route reflectors. + + When the route reflector changes the set, it should immediately + change its inbound route filtering. In addition, if the route + reflector uses ORFs, then the ORFs have to be immediately + changed to reflect the changes in the set. If the route + reflector doesn't use ORFs, and a new Route Target is added to + + + + +Rosen & Rekhter Standards Track [Page 21] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + the set, the route reflector, after changing its inbound route + filtering, must issue BGP Refresh to other route reflectors. + + The delay of "a few hours" mentioned above allows a route + reflector to hold onto routes with a given RT, even after it + loses the last of its clients that are interested in such + routes. This protects against the need to reacquire all such + routes if the clients' "disappearance" is only temporary. + + With this procedure, VPN Join and Prune operations are also + non-disruptive. + + Note that this technique will not work properly if some client + PE has a VRF with an import Route Target that is not one of its + export Route Targets. + + In these procedures, a PE router which attaches to a particular VPN + "auto-discovers" the other PEs that attach to the same VPN. When a + new PE router is added, or when an existing PE router attaches to a + new VPN, no reconfiguration of other PE routers is needed. + + Just as there is no one PE router that needs to know all the VPN-IPv4 + routes supported over the backbone, these distribution rules ensure + that there is no one Route Reflector (RR) that needs to know all the + VPN-IPv4 routes supported over the backbone. 
As a result, the total + number of such routes that can be supported over the backbone is not + bounded by the capacity of any single device, and therefore can + increase virtually without bound. + +4.3.4. How VPN-IPv4 NLRI Is Carried in BGP + + The BGP Multiprotocol Extensions [BGP-MP] are used to encode the + NLRI. If the Address Family Identifier (AFI) field is set to 1, and + the Subsequent Address Family Identifier (SAFI) field is set to 128, + the NLRI is an MPLS-labeled VPN-IPv4 address. AFI 1 is used since + the network layer protocol associated with the NLRI is still IP. + Note that this VPN architecture does not require the capability to + distribute unlabeled VPN-IPv4 addresses. + + In order for two BGP speakers to exchange labeled VPN-IPv4 NLRI, they + must use BGP Capabilities Advertisement to ensure that they both are + capable of properly processing such NLRI. This is done as specified + in [BGP-MP], by using capability code 1 (multiprotocol BGP), with an + AFI of 1 and an SAFI of 128. + + The labeled VPN-IPv4 NLRI itself is encoded as specified in + [MPLS-BGP], where the prefix consists of an 8-byte RD followed by an + IPv4 prefix. + + + +Rosen & Rekhter Standards Track [Page 22] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + +4.3.5. Building VPNs Using Route Targets + + By setting up the Import Targets and Export Targets properly, one can + construct different kinds of VPNs. + + Suppose it is desired to create a fully meshed closed user group, + i.e., a set of sites where each can send traffic directly to the + other, but traffic cannot be sent to or received from other sites. + Then each site is associated with a VRF, a single Route Target + attribute is chosen, that Route Target is assigned to each VRF as + both the Import Target and the Export Target, and that Route Target + is not assigned to any other VRFs as either the Import Target or the + Export Target. 
+ + Alternatively, suppose one desired, for whatever reason, to create a + "hub and spoke" kind of VPN. This could be done by the use of two + Route Target values, one meaning "Hub" and one meaning "Spoke". At + the VRFs attached to the hub sites, "Hub" is the Export Target and + + "Spoke" is the Import Target. At the VRFs attached to the spoke + site, "Hub" is the Import Target and "Spoke" is the Export Target. + + Thus, the methods for controlling the distribution of routing + information among various sets of sites are very flexible, which in + turn provides great flexibility in constructing VPNs. + +4.3.6. Route Distribution Among VRFs in a Single PE + + It is possible to distribute routes from one VRF to another, even if + both VRFs are in the same PE, even though in this case one cannot say + that the route has been distributed by BGP. Nevertheless, the + decision to distribute a particular route from one VRF to another + within a single PE is the same decision that would be made if the + VRFs were on different PEs. That is, it depends on the Route Target + attribute that is assigned to the route (or would be assigned if the + route were distributed by BGP), and the import target of the second + VRF. + +5. Forwarding + + If the intermediate routers in the backbone do not have any + information about the routes to the VPNs, how are packets forwarded + from one VPN site to another? + + When a PE receives an IP packet from a CE device, it chooses a + particular VRF in which to look up the packet's destination address. + This choice is based on the packet's ingress attachment circuit. + + + + +Rosen & Rekhter Standards Track [Page 23] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + Assume that a match is found. As a result we learn the packet's + "next hop". 
+ + If the packet's next hop is reached directly over a VRF attachment + circuit from this PE (i.e., the packet's egress attachment circuit is + on the same PE as its ingress attachment circuit), then the packet is + sent on the egress attachment circuit, and no MPLS labels are pushed + onto the packet's label stack. + + If the ingress and egress attachment circuits are on the same PE, but + are associated with different VRFs, and if the route that best + matches the destination address in the ingress attachment circuit's + VRF is an aggregate of several routes in the egress attachment + circuit's VRF, it may be necessary to look up the packet's + destination address in the egress VRF as well. + + If the packet's next hop is NOT reached through a VRF attachment + circuit, then the packet must travel at least one hop through the + backbone. The packet thus has a "BGP Next Hop", and the BGP Next Hop + will have assigned an MPLS label for the route that best matches the + packet's destination address. Call this label the "VPN route label". + The IP packet is turned into an MPLS packet with the VPN route label + as the sole label on the label stack. + + The packet must then be tunneled to the BGP Next Hop. + + If the backbone supports MPLS, this is done as follows: + + - The PE routers (and any Autonomous System border routers) that + redistribute VPN-IPv4 addresses need to insert /32 address + prefixes for themselves into the IGP routing tables of the + backbone. This enables MPLS, at each node in the backbone + network, to assign a label corresponding to the route to each PE + router. To ensure interoperability among different + implementations, it is required to support LDP for setting up the + label switched paths across the backbone. However, other methods + of setting up these label switched paths are also possible. + (Some of these other methods may not require the presence of the + /32 address prefixes in the IGP.) 
+ + - If there are any traffic engineering tunnels to the BGP next hop, + and if one or more of those is available for use by the packet in + question, one of these tunnels is chosen. This tunnel will be + associated with an MPLS label, the "tunnel label". The tunnel + label gets pushed on the MPLS label stack, and the packet is + forwarded to the tunnel's next hop. + + + + + +Rosen & Rekhter Standards Track [Page 24] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + - Otherwise, + + * The packet will have an "IGP Next Hop", which is the next hop + along the IGP route to the BGP Next Hop. + + * If the BGP Next Hop and the IGP Next Hop are the same, and if + penultimate hop popping is used, the packet is then sent to + the IGP Next Hop, carrying only the VPN route label. + + * Otherwise, the IGP Next Hop will have assigned a label for + the route that best matches the address of the BGP Next Hop. + Call this the "tunnel label". The tunnel label gets pushed + on as the packet's top label. The packet is then forwarded + to the IGP Next Hop. + + - MPLS will then carry the packet across the backbone to the BGP + Next Hop, where the VPN label will be examined. + + If the backbone does not support MPLS, the MPLS packet carrying only + the VPN route label may be tunneled to the BGP Next Hop using the + techniques of [MPLS-in-IP-GRE]. When the packet emerges from the + tunnel, it will be at the BGP Next Hop, where the VPN route label + will be examined. + + At the BGP Next Hop, the treatment of the packet depends on the VPN + route label (see Section 4.3.2). In many cases, the PE will be able + to determine, from this label, the attachment circuit over which the + packet should be transmitted (to a CE device), as well as the proper + data link layer header for that interface. In other cases, the PE + may only be able to determine that the packet's destination address + needs to be looked up in a particular VRF before being forwarded to a + CE device. 
There are also intermediate cases in which the VPN route + label may determine the packet's egress attachment circuit, but a + lookup (e.g., ARP) still needs to be done in order to determine the + packet's data link header on that attachment circuit. + + Information in the MPLS header itself, and/or information associated + with the label, may also be used to provide QoS on the interface to + the CE. + + In any event, if the packet was an unlabeled IP packet when it + arrived at its ingress PE, it will again be an unlabeled packet when + it leaves its egress PE. + + The fact that packets with VPN route labels are tunneled through the + backbone is what makes it possible to keep all the VPN routes out of + the P routers. This is crucial to ensuring the scalability of the + + + + +Rosen & Rekhter Standards Track [Page 25] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + scheme. The backbone does not even need to have routes to the CEs, + only to the PEs. + + With respect to the tunnels, it is worth noting that this + specification: + + - DOES NOT require that the tunnels be point-to-point; multipoint- + to-point can be used; + + - DOES NOT require that there be any explicit setup of the tunnels, + either via signaling or via manual configuration; + + - DOES NOT require that there be any tunnel-specific signaling; + + - DOES NOT require that there be any tunnel-specific state in the P + or PE routers, beyond what is necessary to maintain the routing + information and (if used) the MPLS label information. + + Of course, this specification is compatible with the use of point- + to-point tunnels that must be explicitly configured and/or signaled, + and in some situations there may be reasons for using such tunnels. + + The considerations that are relevant to choosing a particular + tunneling technology are outside the scope of this specification. + +6. 
Maintaining Proper Isolation of VPNs + + To maintain proper isolation of one VPN from another, it is important + that no router in the backbone accept a tunneled packet from outside + the backbone, unless it is sure that both endpoints of that tunnel + are outside the backbone. + + If MPLS is being used as the tunneling technology, this means that a + router in the backbone MUST NOT accept a labeled packet from any + adjacent non-backbone device unless the following two conditions + hold: + + 1. the label at the top of the label stack was actually + distributed by that backbone router to that non-backbone + device, and + + 2. the backbone router can determine that use of that label will + cause the packet to leave the backbone before any labels lower + in the stack will be inspected, and before the IP header will + be inspected. + + The first condition ensures that any labeled packets received from + non-backbone routers have a legitimate and properly assigned label at + + + +Rosen & Rekhter Standards Track [Page 26] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + the top of the label stack. The second condition ensures that the + backbone routers will never look below that top label. Of course, + the simplest way to meet these two conditions is just to have the + backbone devices refuse to accept labeled packets from non-backbone + devices. + + If MPLS is not being used as the tunneling technology, then filtering + must be done to ensure that an MPLS-in-IP or MPLS-in-GRE packet can + be accepted into the backbone only if the packet's IP destination + address will cause it to be sent outside the backbone. + +7. How PEs Learn Routes from CEs + + The PE routers that attach to a particular VPN need to know, for each + attachment circuit leading to that VPN, which of the VPN's addresses + should be reached over that attachment circuit. + + The PE translates these addresses into VPN-IPv4 addresses, using a + configured RD. 
The PE then treats these VPN-IPv4 routes as input to + BGP. Routes from a VPN site are NOT leaked into the backbone's IGP. + + Exactly which PE/CE route distribution techniques are possible + depends on whether or not a particular CE is in a "transit VPN". A + "transit VPN" is one that contains a router that receives routes from + a "third party" (i.e., from a router that is not in the VPN, but is + not a PE router) and that redistributes those routes to a PE router. + A VPN that is not a transit VPN is a "stub VPN". The vast majority + of VPNs, including just about all corporate enterprise networks, + would be expected to be "stubs" in this sense. + + The possible PE/CE distribution techniques are: + + 1. Static routing (i.e., configuration) may be used. (This is + likely to be useful only in stub VPNs.) + + 2. PE and CE routers may be Routing Information Protocol (RIP) + [RIP] peers, and the CE may use RIP to tell the PE router the + set of address prefixes that are reachable at the CE router's + site. When RIP is configured in the CE, care must be taken to + ensure that address prefixes from other sites (i.e., address + prefixes learned by the CE router from the PE router) are never + advertised to the PE. More precisely: if a PE router, say, + PE1, receives a VPN-IPv4 route R1, and as a result distributes + an IPv4 route R2 to a CE, then R2 must not be distributed back + from that CE's site to a PE router, say, PE2, (where PE1 and + PE2 may be the same router or different routers), unless PE2 + maps R2 to a VPN-IPv4 route that is different than (i.e., + contains a different RD than) R1. + + + +Rosen & Rekhter Standards Track [Page 27] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + 3. The PE and CE routers may be OSPF peers. A PE router that is + an OSPF peer of a CE router appears, to the CE router, to be an + area 0 router. If a PE router is an OSPF peer of CE routers + that are in distinct VPNs, the PE must of course be running + multiple instances of OSPF. 
+ + IPv4 routes that the PE learns from the CE via OSPF are + redistributed into BGP as VPN-IPv4 routes. Extended Community + attributes are used to carry, along with the route, all the + information needed to enable the route to be distributed to + other CE routers in the VPN in the proper type of OSPF Link + State Advertisement (LSA). OSPF route tagging is used to + ensure that routes received from the MPLS/BGP backbone are not + sent back into the backbone. + + Specification of the complete set of procedures for the use of + OSPF between PE and CE can be found in [VPN-OSPF] and + [OSPF-2547-DNBIT]. + + 4. The PE and CE routers may be BGP peers, and the CE router may + use BGP (in particular, EBGP) to tell the PE router the set of + address prefixes that are at the CE router's site. (This + technique can be used in stub VPNs or transit VPNs.) + + This technique has a number of advantages over the others: + + a) Unlike the IGP alternatives, this does not require the PE + to run multiple routing algorithm instances in order to + talk to multiple CEs. + + b) BGP is explicitly designed for just this function: + passing routing information between systems run by + different administrations. + + c) If the site contains "BGP backdoors", i.e., routers with + BGP connections to routers other than PE routers, this + procedure will work correctly in all circumstances. The + other procedures may or may not work, depending on the + precise circumstances. + + d) Use of BGP makes it easy for the CE to pass attributes of + the routes to the PE. A complete specification of the + set of attributes and their use is outside the scope of + this document. However, some examples of the way this + may be used are the following: + + + + + + +Rosen & Rekhter Standards Track [Page 28] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + - The CE may suggest a particular Route Target for each + route, from among the Route Targets that the PE is + authorized to attach to the route. 
The PE would then + attach only the suggested Route Target, rather than + the full set. This gives the CE administrator some + dynamic control of the distribution of routes from + the CE. + + - Additional types of Extended Community attributes may + be defined, where the intention is to have those + attributes passed transparently (i.e., without being + changed by the PE routers) from CE to CE. This would + allow CE administrators to implement additional route + filtering, beyond that which is done by the PEs. + This additional filtering would not require + coordination with the SP. + + On the other hand, using BGP may be something new for the CE + administrators. + + If a site is not in a transit VPN, note that it need not have a + unique Autonomous System Number (ASN). Every CE whose site is + not in a transit VPN can use the same ASN. This can be chosen + from the private ASN space, and it will be stripped out by the + PE. Routing loops are prevented by use of the Site of Origin + attribute (see below). + + What if a set of sites constitutes a transit VPN? This will + generally be the case only if the VPN is itself an Internet + Service Provider's (ISP's) network, where the ISP is itself + buying backbone services from another SP. The latter SP may be + called a "carrier's carrier". In this case, the best way to + provide the VPN is to have the CE routers support MPLS, and to + use the technique described in Section 9. + + When we do not need to distinguish among the different ways in which + a PE can be informed of the address prefixes that exist at a given + site, we will simply say that the PE has "learned" the routes from + that site. This includes the case where the PE has been manually + configured with the routes. + + Before a PE can redistribute a VPN-IPv4 route learned from a site, it + must assign a Route Target attribute (see Section 4.3.1) to the + route, and it may assign a Site of Origin attribute to the route. 
+ + The Site of Origin attribute, if used, is encoded as a Route Origin + Extended Community [BGP-EXTCOMM]. The purpose of this attribute is + to uniquely identify the set of routes learned from a particular + + + +Rosen & Rekhter Standards Track [Page 29] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + site. This attribute is needed in some cases to ensure that a route + learned from a particular site via a particular PE/CE connection is + not distributed back to the site through a different PE/CE + connection. It is particularly useful if BGP is being used as the + PE/CE protocol, but different sites have not been assigned distinct + ASNs. + +8. How CEs Learn Routes from PEs + + In this section, we assume that the CE device is a router. + + If the PE places a particular route in the VRF it uses to route + packets received from a particular CE, then in general, the PE may + distribute that route to the CE. Of course, the PE may distribute + that route to the CE only if this is permitted by the rules of the + PE/CE protocol. (For example, if a particular PE/CE protocol has + "split horizon", certain routes in the VRF cannot be redistributed + back to the CE.) We add one more restriction on the distribution of + routes from PE to CE: if a route's Site of Origin attribute + identifies a particular site, that route must never be redistributed + to any CE at that site. + + In most cases, however, it will be sufficient for the PE to simply + distribute the default route to the CE. (In some cases, it may even + be sufficient for the CE to be configured with a default route + pointing to the PE.) This will generally work at any site that does + not itself need to distribute the default route to other sites. + (E.g., if one site in a corporate VPN has the corporation's access to + the Internet, that site might need to have default distributed to the + other site, but one could not distribute default to that site + itself.) 
+ + Whatever procedure is used to distribute routes from CE to PE will + also be used to distribute routes from PE to CE. + +9. Carriers' Carriers + + Sometimes a VPN may actually be the network of an ISP, with its own + peering and routing policies. Sometimes a VPN may be the network of + an SP that is offering VPN services in turn to its own customers. + VPNs like these can also obtain backbone service from another SP, the + "carrier's carrier", using essentially the same methods described in + this document. However, it is necessary in these cases that the CE + routers support MPLS. In particular: + + - The CE routers should distribute to the PE routers ONLY those + routes that are internal to the VPN. This allows the VPN to be + handled as a stub VPN. + + + +Rosen & Rekhter Standards Track [Page 30] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + - The CE routers should support MPLS, in that they should be able + to receive labels from the PE routers, and send labeled packets + to the PE routers. They do not need to distribute labels of + their own, though. + + - The PE routers should distribute, to the CE routers, labels for + the routes they distribute to the CE routers. + + The PE must not distribute the same label to two different CEs + unless one of the following conditions holds: + + * The two CEs are associated with exactly the same set of VRFs; + + * The PE maintains a different Incoming Label Map ([MPLS-ARCH]) + for each CE. + + Further, when the PE receives a labeled packet from a CE, it must + verify that the top label is one that was distributed to that CE. + + - Routers at the different sites should establish BGP connections + among themselves for the purpose of exchanging external routes + (i.e., routes that lead outside of the VPN). + + - All the external routes must be known to the CE routers. 
+ + Then when a CE router looks up a packet's destination address, the + routing lookup will resolve to an internal address, usually the + address of the packet's BGP next hop. The CE labels the packet + appropriately and sends the packet to the PE. The PE, rather than + looking up the packet's IP destination address in a VRF, uses the + packet's top MPLS label to select the BGP next hop. As a result, if + the BGP next hop is more than one hop away, the top label will be + replaced by two labels, a tunnel label and a VPN route label. If the + BGP next hop is one hop away, the top label may be replaced by just + the VPN route label. If the ingress PE is also the egress PE, the + top label will just be popped. When the packet is sent from its + egress PE to a CE, the packet will have one fewer MPLS labels than it + had when it was first received by its ingress PE. + + In the above procedure, the CE routers are the only routers in the + VPN that need to support MPLS. If, on the other hand, all the + routers at a particular VPN site support MPLS, then it is no longer + required that the CE routers know all the external routes. All that + is required is that the external routes be known to whatever routers + are responsible for putting the label stack on a hitherto unlabeled + packet and that there be a label switched path that leads from those + routers to their BGP peers at other sites. In this case, for each + + + + +Rosen & Rekhter Standards Track [Page 31] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + internal route that a CE router distributes to a PE router, it must + also distribute a label. + +10. Multi-AS Backbones + + What if two sites of a VPN are connected to different Autonomous + Systems (e.g., because the sites are connected to different SPs)? + The PE routers attached to that VPN will then not be able to maintain + IBGP connections with each other, or with a common route reflector. 
+ Rather, there needs to be some way to use EBGP to distribute VPN-IPv4 + addresses. + + There are a number of different ways of handling this case, which we + present in order of increasing scalability. + + a) VRF-to-VRF connections at the AS (Autonomous System) border + routers. + + In this procedure, a PE router in one AS attaches directly to a + PE router in another. The two PE routers will be attached by + multiple sub-interfaces, at least one for each of the VPNs + whose routes need to be passed from AS to AS. Each PE will + treat the other as if it were a CE router. That is, the PEs + associate each such sub-interface with a VRF, and use EBGP to + distribute unlabeled IPv4 addresses to each other. + + This is a procedure that "just works", and that does not + require MPLS at the border between ASes. However, it does not + scale as well as the other procedures discussed below. + + b) EBGP redistribution of labeled VPN-IPv4 routes from AS to + neighboring AS. + + In this procedure, the PE routers use IBGP to redistribute + labeled VPN-IPv4 routes either to an Autonomous System Border + Router (ASBR), or to a route reflector of which an ASBR is a + client. The ASBR then uses EBGP to redistribute those labeled + VPN-IPv4 routes to an ASBR in another AS, which in turn + distributes them to the PE routers in that AS, or perhaps to + another ASBR which in turn distributes them, and so on. + + When using this procedure, VPN-IPv4 routes should only be + accepted on EBGP connections at private peering points, as part + of a trusted arrangement between SPs. VPN-IPv4 routes should + neither be distributed to nor accepted from the public + Internet, or from any BGP peers that are not trusted. An ASBR + should never accept a labeled packet from an EBGP peer unless + it has actually distributed the top label to that peer. 
+ + + +Rosen & Rekhter Standards Track [Page 32] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + If there are many VPNs having sites attached to different + Autonomous Systems, there does not need to be a single ASBR + between those two ASes that holds all the routes for all the + VPNs; there can be multiple ASBRs, each of which holds only the + routes for a particular subset of the VPNs. + + This procedure requires that there be a label switched path + leading from a packet's ingress PE to its egress PE. Hence the + appropriate trust relationships must exist between and among + the set of ASes along the path. Also, there must be agreement + among the set of SPs as to which border routers need to receive + routes with which Route Targets. + + c) Multi-hop EBGP redistribution of labeled VPN-IPv4 routes + between source and destination ASes, with EBGP redistribution + of labeled IPv4 routes from AS to neighboring AS. + + In this procedure, VPN-IPv4 routes are neither maintained nor + distributed by the ASBRs. An ASBR must maintain labeled IPv4 + /32 routes to the PE routers within its AS. It uses EBGP to + distribute these routes to other ASes. ASBRs in any transit + ASes will also have to use EBGP to pass along the labeled /32 + routes. This results in the creation of a label switched path + from the ingress PE router to the egress PE router. Now PE + routers in different ASes can establish multi-hop EBGP + connections to each other, and can exchange VPN-IPv4 routes + over those connections. + + If the /32 routes for the PE routers are made known to the P + routers of each AS, everything works normally. If the /32 + routes for the PE routers are NOT made known to the P routers + (other than the ASBRs), then this procedure requires a packet's + ingress PE to put a three-label stack on it. The bottom label + is assigned by the egress PE, corresponding to the packet's + destination address in a particular VRF. 
The middle label is + assigned by the ASBR, corresponding to the /32 route to the + egress PE. The top label is assigned by the ingress PE's IGP + Next Hop, corresponding to the /32 route to the ASBR. + + To improve scalability, one can have the multi-hop EBGP + connections exist only between a route reflector in one AS and + a route reflector in another. (However, when the route + reflectors distribute routes over this connection, they do not + modify the BGP next hop attribute of the routes.) The actual + PE routers would then only have IBGP connections to the route + reflectors in their own AS. + + + + + +Rosen & Rekhter Standards Track [Page 33] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + This procedure is very similar to the "carrier's carrier" + procedures described in Section 9. Like the previous + procedure, it requires that there be a label switched path + leading from a packet's ingress PE to its egress PE. + +11. Accessing the Internet from a VPN + + Many VPN sites will need to be able to access the public Internet, as + well as to access other VPN sites. The following describes some of + the alternative ways of doing this. + + 1. In some VPNs, one or more of the sites will obtain Internet + access by means of an "Internet gateway" (perhaps a firewall) + attached to a non-VRF interface to an ISP. The ISP may or may + not be the same organization as the SP that is providing the + VPN service. Traffic to/from the Internet gateway would then + be routed according to the PE router's default forwarding + table. + + In this case, the sites that have Internet access may be + distributing a default route to their PEs, which in turn + redistribute it to other PEs and hence into other sites of the + VPN. This provides Internet access for all of the VPN's sites. + + In order to properly handle traffic from the Internet, the ISP + must distribute, to the Internet, routes leading to addresses + that are within the VPN. 
This is completely independent of any + of the route distribution procedures described in this + document. The internal structure of the VPN will in general + not be visible from the Internet; such routes would simply lead + to the non-VRF interface that attaches to the VPN's Internet + gateway. + + In this model, there is no exchange of routes between a PE + router's default forwarding table and any of its VRFs. VPN + route distribution procedures and Internet route distribution + procedures are completely independent. + + Note that although some sites of the VPN use a VRF interface to + communicate with the Internet, ultimately all packets to/from + the Internet traverse a non-VRF interface before + leaving/entering the VPN, so we refer to this as "non-VRF + Internet access". + + Note that the PE router to which the non-VRF interface attaches + does not necessarily need to maintain all the Internet routes + in its default forwarding table. The default forwarding table + could have as few as one route, "default", which leads to + + + +Rosen & Rekhter Standards Track [Page 34] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + another router (probably an adjacent one) that has the Internet + routes. A variation of this scheme is to tunnel packets + received over the non-VRF interface from the PE router to + another router, where this other router maintains the full set + of Internet routes. + + 2. Some VPNs may obtain Internet access via a VRF interface ("VRF + Internet access"). If a packet is received by a PE over a VRF + interface, and if the packet's destination address does not + match any route in the VRF, then it may be matched against the + PE's default forwarding table. If a match is made there, the + packet can be forwarded natively through the backbone to the + Internet, instead of being forwarded by MPLS. 
+ + In order for traffic to flow natively in the opposite direction + (from Internet to VRF interface), some of the routes from the + VRF must be exported to the Internet forwarding table. + Needless to say, any such routes must correspond to globally + unique addresses. + + In this scheme, the default forwarding table might have the + full set of Internet routes, or it might have as little as a + single default route leading to another router that does have + the full set of Internet routes in its default forwarding + table. + + 3. Suppose the PE has the capability to store "non-VPN routes" in + a VRF. If a packet's destination address matches a "non-VPN + route", then the packet is transmitted natively, rather than + being transmitted via MPLS. If the VRF contains a non-VPN + default route, all packets for the public Internet will match + it, and be forwarded natively to the default route's next hop. + At that next hop, the packets' destination addresses will be + looked up in the default forwarding table, and may match more + specific routes. + + This technique would only be available if none of the CE + routers is distributing a default route. + + 4. It is also possible to obtain Internet access via a VRF + interface by having the VRF contain the Internet routes. + Compared with model 2, this eliminates the second lookup, but + it has the disadvantage of requiring the Internet routes to be + replicated in each such VRF. + + If this technique is used, the SP may want to make its + interface to the Internet be a VRF interface, and to use the + + + + +Rosen & Rekhter Standards Track [Page 35] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + techniques of Section 4 to distribute Internet routes, as VPN- + IPv4 routes, to other VRFs. + + It should be clearly understood that by default, there is no exchange + of routes between a VRF and the default forwarding table. 
This is + done ONLY upon agreement between a customer and an SP, and only if it + suits the customer's policies. + +12. Management VPNs + + This specification does not require that the sub-interface connecting + a PE router and a CE router be a "numbered" interface. If it is a + numbered interface, this specification allows the addresses assigned + to the interface to come from either the address space of the VPN or + the address space of the SP. + + If a CE router is being managed by the Service Provider, then the + Service Provider will likely have a network management system that + needs to be able to communicate with the CE router. In this case, + the addresses assigned to the sub-interface connecting the CE and PE + routers should come from the SP's address space, and should be unique + within that space. The network management system should itself + connect to a PE router (more precisely, be at a site that connects to + a PE router) via a VRF interface. The address of the network + management system will be exported to all VRFs that are associated + with interfaces to CE routers that are managed by the SP. The + addresses of the CE routers will be exported to the VRF associated + with the network management system, but not to any other VRFs. + + This allows communication between the CE and network management + system, but does not allow any undesired communication to or among + the CE routers. + + One way to ensure that the proper route import/exports are done is to + use two Route Targets; call them T1 and T2. If a particular VRF + interface attaches to a CE router that is managed by the SP, then + that VRF is configured to: + + - import routes that have T1 attached to them, and + + - attach T2 to addresses assigned to each end of its VRF + interfaces. + + If a particular VRF interface attaches to the SP's network management + system, then that VRF is configured to attach T1 to the address of + that system, and to import routes that have T2 attached to them. 
+ + + + + +Rosen & Rekhter Standards Track [Page 36] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + +13. Security Considerations + +13.1. Data Plane + + By security in the "data plane", we mean protection against the + following possibilities: + + - Packets from within a VPN travel to a site outside the VPN, other + than in a manner consistent with the policies of the VPN. + + - Packets from outside a VPN enter one of the VPN's sites, other + than in a manner consistent with the policies of the VPN. + + Under the following conditions: + + 1. a backbone router does not accept labeled packets over a + particular data link, unless it is known that that data link + attaches only to trusted systems, or unless it is known that + such packets will leave the backbone before the IP header or + any labels lower in the stack will be inspected, and + + 2. labeled VPN-IPv4 routes are not accepted from untrusted or + unreliable routing peers, + + 3. no successful attacks have been mounted on the control plane, + + the data plane security provided by this architecture is virtually + identical to that provided to VPNs by Frame Relay or ATM backbones. + If the devices under the control of the SP are properly configured, + data will not enter or leave a VPN unless authorized to do so. + + Condition 1 above can be stated more precisely. One should discard a + labeled packet received from a particular neighbor unless one of the + following two conditions holds: + + - the packet's top label has a label value that the receiving + system has distributed to that neighbor, or + + - the packet's top label has a label value that the receiving + system has distributed to a system beyond that neighbor (i.e., + when it is known that the path from the system to which the label + was distributed to the receiving system may be via that + neighbor). 
+ + + + + + + + +Rosen & Rekhter Standards Track [Page 37] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + Condition 2 above is of most interest in the case of inter-provider + VPNs (see Section 10). For inter-provider VPNs constructed according + to scheme b) of Section 10, condition 2 is easily checked. (The + issue of security when scheme (c) of Section 10 is used is for + further study.) + + It is worth noting that the use of MPLS makes it much simpler to + provide data plane security than might be possible if one attempted + to use some form of IP tunneling in place of the MPLS outer label. + It is a simple matter to have one's border routers refuse to accept a + labeled packet unless the first of the above conditions applies to + it. It is rather more difficult to configure a router to refuse to + accept an IP packet if that packet is an IP tunneled packet whose + destination address is that of a PE router; certainly, this is not + impossible to do, but it has both management and performance + implications. + + MPLS-in-IP and MPLS-in-GRE tunneling are specified in + [MPLS-in-IP-GRE]. If it is desired to use such tunnels to carry VPN + packets, then the security considerations described in Section 8 of + that document must be fully understood. Any implementation of + BGP/MPLS IP VPNs that allows VPN packets to be tunneled as described + in that document MUST contain an implementation of IPsec that can be + used as therein described. If the tunnel is not secured by IPsec, + then the technique of IP address filtering at the border routers, + described in Section 8.2 of that document, is the only means of + ensuring that a packet that exits the tunnel at a particular egress + PE was actually placed in the tunnel by the proper tunnel head node + (i.e., that the packet does not have a spoofed source address). 
+ Since border routers frequently filter only source addresses, packet + filtering may not be effective unless the egress PE can check the IP + source address of any tunneled packet it receives, and compare it to + a list of IP addresses that are valid tunnel head addresses. Any + implementation that allows MPLS-in-IP and/or MPLS-in-GRE tunneling to + be used without IPsec MUST allow the egress PE to validate in this + manner the IP source address of any tunneled packet that it receives. + + In the case where a number of CE routers attach to a PE router via a + LAN interface, to ensure proper security, one of the following + conditions must hold: + + 1. All the CE routers on the LAN belong to the same VPN, or + + 2. A trusted and secured LAN switch divides the LAN into multiple + VLANs, with each VLAN containing only systems of a single VPN; + in this case, the switch will attach the appropriate VLAN tag + to any packet before forwarding it to the PE router. + + + + +Rosen & Rekhter Standards Track [Page 38] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + Cryptographic privacy is not provided by this architecture, nor by + Frame Relay or ATM VPNs. These architectures are all compatible with + the use of cryptography on a CE-CE basis, if that is desired. + + The use of cryptography on a PE-PE basis is for further study. + +13.2. Control Plane + + The data plane security of the previous section depends on the + security of the control plane. To ensure security, neither BGP nor + LDP connections should be made with untrusted peers. The TCP/IP MD5 + authentication option [TCP-MD5] should be used with both these + protocols. The routing protocol within the SP's network should also + be secured in a similar manner. + +13.3. Security of P and PE Devices + + If the physical security of these devices is compromised, data plane + security may also be compromised. 
+ + The usual steps should be taken to ensure that IP traffic from the + public Internet cannot be used to modify the configuration of these + devices, or to mount Denial of Service attacks on them. + +14. Quality of Service + + Although not the focus of this paper, Quality of Service is a key + component of any VPN service. In MPLS/BGP VPNs, existing L3 QoS + capabilities can be applied to labeled packets through the use of the + "experimental" bits in the shim header [MPLS-ENCAPS], or, where ATM + is used as the backbone, through the use of ATM QoS capabilities. + The traffic engineering work discussed in [MPLS-RSVP] is also + directly applicable to MPLS/BGP VPNs. Traffic engineering could even + be used to establish label switched paths with particular QoS + characteristics between particular pairs of sites, if that is + desirable. Where an MPLS/BGP VPN spans multiple SPs, the + architecture described in [PASTE] may be useful. An SP may apply + either intserv (Integrated Services) or diffserv (Differentiated + Services) capabilities to a particular VPN, as appropriate. + + + + + + + + + + + + +Rosen & Rekhter Standards Track [Page 39] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + +15. Scalability + + We have discussed scalability issues throughout this paper. In this + section, we briefly summarize the main characteristics of our model + with respect to scalability. + + The Service Provider backbone network consists of (a) PE routers, (b) + BGP Route Reflectors, (c) P routers (that are neither PE routers nor + Route Reflectors), and, in the case of multi-provider VPNs, (d) + ASBRs. + + P routers do not maintain any VPN routes. In order to properly + forward VPN traffic, the P routers need only maintain routes to the + PE routers and the ASBRs. The use of two levels of labeling is what + makes it possible to keep the VPN routes out of the P routers. + + A PE router maintains VPN routes, but only for those VPNs to which it + is directly attached. 
+ + Route reflectors can be partitioned among VPNs so that each partition + carries routes for only a subset of the VPNs supported by the Service + Provider. Thus, no single route reflector is required to maintain + routes for all VPNs. + + For inter-provider VPNs, if the ASBRs maintain and distribute VPN- + IPv4 routes, then the ASBRs can be partitioned among VPNs in a + similar manner, with the result that no single ASBR is required to + maintain routes for all the inter-provider VPNs. If multi-hop EBGP + is used, then the ASBRs need not maintain and distribute VPN-IPv4 + routes at all. + + As a result, no single component within the Service Provider network + has to maintain all the routes for all the VPNs. So the total + capacity of the network to support increasing numbers of VPNs is not + limited by the capacity of any individual component. + +16. IANA Considerations + + The Internet Assigned Numbers Authority (IANA) has created a new + registry for the "Route Distinguisher Type Field" (see Section 4.2). + This is a two-byte field. Types 0, 1, and 2 are defined by this + document. Additional Route Distinguisher Type Field values with a + high-order bit of 0 may be allocated by IANA on a "First Come, First + Served" basis [IANA]. Values with a high-order bit of 1 may be + allocated by IANA based on "IETF consensus" [IANA]. + + + + + + +Rosen & Rekhter Standards Track [Page 40] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + This document specifies (see Section 4.3.4) the use of the BGP + Address Family Identifier (AFI) value 1, along with the BGP + Subsequent Address Family Identifier (SAFI) value 128, to represent + the address family "VPN-IPv4 Labeled Addresses", which is defined in + this document. + + The use of AFI value 1 for IP is as currently specified in the IANA + registry "Address Family Identifier", so IANA need take no action + with respect to it. 
+ + The SAFI value 128 was originally specified as "Private Use" in the + IANA "Subsequent Address Family Identifier" registry. IANA has + changed the SAFI value 128 from "private use" to "MPLS-labeled VPN + address". + +17. Acknowledgements + + The full list of contributors can be found in Section 18. + + Significant contributions to this work have also been made by Ravi + Chandra, Dan Tappan, and Bob Thomas. + + We also wish to thank Shantam Biswas for his review and + contributions. + +18. Contributors + + Tony Bogovic + Telcordia Technologies + 445 South Street, Room 1A264B + Morristown, NJ 07960 + + EMail: tjb@research.telcordia.com + + + Stephen John Brannon + Swisscom AG + Postfach 1570 + CH-8301 + Glattzentrum (Zuerich), Switzerland + + EMail: stephen.brannon@swisscom.com + + + + + + + + + +Rosen & Rekhter Standards Track [Page 41] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + Marco Carugi + Nortel Networks S.A. + Parc d'activites de Magny-Les Jeunes Bois CHATEAUFORT + 78928 YVELINES Cedex 9 - FRANCE + + EMail: marco.carugi@nortelnetworks.com + + + Christopher J. Chase + AT&T + 200 Laurel Ave + Middletown, NJ 07748 + USA + + EMail: chase@att.com + + + Ting Wo Chung + Bell Nexxia + 181 Bay Street + Suite 350 + Toronto, Ontario + M5J2T3 + + EMail: ting_wo.chung@bellnexxia.com + + + Eric Dean + + + Jeremy De Clercq + Alcatel Network Strategy Group + Francis Wellesplein 1 + 2018 Antwerp, Belgium + + EMail: jeremy.de_clercq@alcatel.be + + + Luyuan Fang + AT&T + IP Backbone Architecture + 200 Laurel Ave. + Middletown, NJ 07748 + + EMail: luyuanfang@att.com + + + + + + +Rosen & Rekhter Standards Track [Page 42] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + Paul Hitchen + BT + BT Adastral Park + Martlesham Heath, + Ipswich IP5 3RE + UK + + EMail: paul.hitchen@bt.com + + + Manoj Leelanivas + Juniper Networks, Inc. 
+ 385 Ravendale Drive + Mountain View, CA 94043 USA + + EMail: manoj@juniper.net + + + Dave Marshall + Worldcom + 901 International Parkway + Richardson, Texas 75081 + + EMail: dave.marshall@wcom.com + + + Luca Martini + Cisco Systems, Inc. + 9155 East Nichols Avenue, Suite 400 + Englewood, CO, 80112 + + EMail: lmartini@cisco.com + + + Monique Jeanne Morrow + Cisco Systems, Inc. + Glatt-com, 2nd floor + CH-8301 + Glattzentrum, Switzerland + + EMail: mmorrow@cisco.com + + + + + + + + + + +Rosen & Rekhter Standards Track [Page 43] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + Ravichander Vaidyanathan + Telcordia Technologies + 445 South Street, Room 1C258B + Morristown, NJ 07960 + + EMail: vravi@research.telcordia.com + + + Adrian Smith + BT + BT Adastral Park + Martlesham Heath, + Ipswich IP5 3RE + UK + + EMail: adrian.ca.smith@bt.com + + + Vijay Srinivasan + 1200 Bridge Parkway + Redwood City, CA 94065 + + EMail: vsriniva@cosinecom.com + + + Alain Vedrenne + Equant + Heraklion, 1041 route des Dolines, BP347 + 06906 Sophia Antipolis, Cedex, France + + EMail: Alain.Vedrenne@equant.com + +19. Normative References + + [BGP] Rekhter, Y. and T. Li, "A Border Gateway Protocol 4 + (BGP-4)", RFC 4271, January 2006. + + [BGP-MP] Bates, T., Rekhter, Y., Chandra, R., and D. Katz, + "Multiprotocol Extensions for BGP-4", RFC 2858, + June 2000. + + [BGP-EXTCOMM] Sangli, S., Tappan, D., and Y. Rekhter, "BGP + Extended Communities Attribute", RFC 4360, February + 2006. + + [MPLS-ARCH] Rosen, E., Viswanathan, A., and R. Callon, + "Multiprotocol Label Switching Architecture", RFC + 3031, January 2001. + + + +Rosen & Rekhter Standards Track [Page 44] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + [MPLS-BGP] Rekhter, Y. and E. Rosen, "Carrying Label + Information in BGP-4", RFC 3107, May 2001. + + [MPLS-ENCAPS] Rosen, E., Tappan, D., Fedorkow, G., Rekhter, Y., + Farinacci, D., Li, T., and A. Conta, "MPLS Label + Stack Encoding", RFC 3032, January 2001. + +20. 
Informative References + + [BGP-AS4] Vohra, Q. and E. Chen, "BGP Support for Four-Octet + AS Number Space", Work in Progress, March 2004. + + [BGP-ORF] Chen, E. and Y. Rekhter, "Cooperative Route + Filtering Capability for BGP-4", Work in Progress, + March 2004. + + [BGP-RFSH] Chen, E., "Route Refresh Capability for BGP-4", RFC + 2918, September 2000. + + [BGP-RR] Bates, T., Chandra, R., and E. Chen, "BGP Route + Reflection - An Alternative to Full Mesh IBGP", RFC + 2796, April 2000. + + [IANA] Narten, T. and H. Alvestrand, "Guidelines for + Writing an IANA Considerations Section in RFCs", + BCP 26, RFC 2434, October 1998. + + [MPLS-ATM] Davie, B., Lawrence, J., McCloghrie, K., Rosen, E., + Swallow, G., Rekhter, Y., and P. Doolan, "MPLS + using LDP and ATM VC Switching", RFC 3035, January + 2001. + + [MPLS/BGP-IPsec] Rosen, E., De Clercq, J., Paridaens, O., T'Joens, + Y., and C. Sargor, "Architecture for the Use of + PE-PE IPsec Tunnels in BGP/MPLS IP VPNs", Work in + Progress, March 2004. + + [MPLS-FR] Conta, A., Doolan, P., and A. Malis, "Use of Label + Switching on Frame Relay Networks Specification", + RFC 3034, January 2001. + + [MPLS-in-IP-GRE] Worster, T., Rekhter, Y., and E. Rosen, + "Encapsulating MPLS in IP or Generic Routing + Encapsulation (GRE)", RFC 4023, March 2005. + + [MPLS-LDP] Andersson, L., Doolan, P., Feldman, N., Fredette, + A., and B. Thomas, "LDP Specification", RFC 3036, + January 2001. + + + +Rosen & Rekhter Standards Track [Page 45] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + + [MPLS-RSVP] Awduche, D., Berger, L., Gan, D., Li, T., + Srinivasan, V., and G. Swallow, "RSVP-TE: + Extensions to RSVP for LSP Tunnels", RFC 3209, + December 2001. + + [OSPFv2] Moy, J., "OSPF Version 2", STD 54, RFC 2328, April + 1998. + + [PASTE] Li, T. and Y. Rekhter, "A Provider Architecture for + Differentiated Services and Traffic Engineering + (PASTE)", RFC 2430, October 1998. + + [RIP] Malkin, G., "RIP Version 2", STD 56, RFC 2453, + November 1998. 
+ + [OSPF-2547-DNBIT] Rosen, E., Psenak, P., and P. Pillay-Esnault, + "Using an LSA Options Bit to Prevent Looping in + BGP/MPLS IP VPNs", Work in Progress, March 2004. + + [TCP-MD5] Heffernan, A., "Protection of BGP Sessions via the + TCP MD5 Signature Option", RFC 2385, August 1998. + + [VPN-MCAST] Rosen, E., Cai, Y., and J. Wijsnands, "Multicast in + MPLS/BGP VPNs", Work in Progress, May 2004. + + [VPN-OSPF] Rosen, E., Psenak, P., and P. Pillay-Esnault, "OSPF + as the PE/CE Protocol in BGP/MPLS VPNs", Work in + Progress, February 2004. + +Authors' Addresses + + Eric C. Rosen + Cisco Systems, Inc. + 1414 Massachusetts Avenue + Boxborough, MA 01719 + + EMail: erosen@cisco.com + + + Yakov Rekhter + Juniper Networks + 1194 N. Mathilda Avenue + Sunnyvale, CA 94089 + + EMail: yakov@juniper.net + + + + + + +Rosen & Rekhter Standards Track [Page 46] + +RFC 4364 BGP/MPLS IP VPNs February 2006 + + +Full Copyright Statement + + Copyright (C) The Internet Society (2006). + + This document is subject to the rights, licenses and restrictions + contained in BCP 78, and except as set forth therein, the authors + retain all their rights. + + This document and the information contained herein are provided on an + "AS IS" basis and THE CONTRIBUTOR, THE ORGANIZATION HE/SHE REPRESENTS + OR IS SPONSORED BY (IF ANY), THE INTERNET SOCIETY AND THE INTERNET + ENGINEERING TASK FORCE DISCLAIM ALL WARRANTIES, EXPRESS OR IMPLIED, + INCLUDING BUT NOT LIMITED TO ANY WARRANTY THAT THE USE OF THE + INFORMATION HEREIN WILL NOT INFRINGE ANY RIGHTS OR ANY IMPLIED + WARRANTIES OF MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE. 
+ +Intellectual Property + + The IETF takes no position regarding the validity or scope of any + Intellectual Property Rights or other rights that might be claimed to + pertain to the implementation or use of the technology described in + this document or the extent to which any license under such rights + might or might not be available; nor does it represent that it has + made any independent effort to identify any such rights. Information + on the procedures with respect to rights in RFC documents can be + found in BCP 78 and BCP 79. + + Copies of IPR disclosures made to the IETF Secretariat and any + assurances of licenses to be made available, or the result of an + attempt made to obtain a general license or permission for the use of + such proprietary rights by implementers or users of this + specification can be obtained from the IETF on-line IPR repository at + http://www.ietf.org/ipr. + + The IETF invites any interested party to bring to its attention any + copyrights, patents or patent applications, or other proprietary + rights that may cover technology that may be required to implement + this standard. Please address the information to the IETF at + ietf-ipr@ietf.org. + +Acknowledgement + + Funding for the RFC Editor function is provided by the IETF + Administrative Support Activity (IASA). + + + + + + + +Rosen & Rekhter Standards Track [Page 47] + diff --git a/monitor/monitor_backend.go b/monitor/monitor_backend.go @@ -0,0 +1,22 @@ +package monitor + +import ( + //"log" + "github.com/gocql/gocql" +) + +type CassandraContext struct { + cconfig *gocql.ClusterConfig + session *gocql.Session +} + +func (c *CassandraContext) StartCassandra(kspace string, addrs ...string) (err error) { + c.cconfig = gocql.NewCluster(addrs...) 
+ c.cconfig.Keyspace = kspace + c.session, err = c.cconfig.CreateSession() + return +} + +func (c *CassandraContext) StopCassandra() { + c.session.Close() +} diff --git a/monitor/monitor_backend_test.go b/monitor/monitor_backend_test.go @@ -0,0 +1,32 @@ +package monitor + +import ( + "testing" +) + +var ( + conerr error +) + +func TestStartStop(t *testing.T) { + c := &CassandraContext{} + conerr = c.StartCassandra("bgp_mongol_test", "worf.netsec.colostate.edu") + if conerr != nil { + t.Logf("could not conect to test cassandra instance at worf.netsec.colostate.edu. not failing test") + return + } + c.StopCassandra() +} + +func TestCreateTable(t *testing.T) { + c := &CassandraContext{} + conerr = c.StartCassandra("bgp_mongol_test", "worf.netsec.colostate.edu") + if conerr != nil { + t.Logf("could not conect to test cassandra instance at worf.netsec.colostate.edu. not failing test") + return + } + if err := c.session.Query("CREATE TABLE bmf ( prefix text PRIMARY KEY, dat1 text, dat2 text);").Exec(); err != nil { + t.Logf(" create table returned error :%v \n", err) + } + c.StopCassandra() +} diff --git a/mrt/mrt.go b/mrt/mrt.go @@ -0,0 +1,377 @@ +package mrt + +import ( + "bytes" + "encoding/binary" + "errors" + "fmt" + "io" + "log" + "net" + "os" + "unicode/utf8" + //"runtime" +) + +var logger = log.New(os.Stderr, "go-mrt: ", log.Ldate|log.Llongfile) + +type MrtHdr struct { + Mrt_timestamp uint32 + Mrt_type uint16 + Mrt_subtype uint16 + Mrt_len uint32 +} + +type parsefunc func([]byte) MrtSubTyper + +type MrtMsg struct { + Hdr MrtHdr + BGPMsg []byte +} + +type MrtSubTyper interface { + Type() string //almost dummy functionality + String() string +} + +type MrtOSPFHdr struct { + otype uint16 + RemoteIP uint32 + LocalIP uint32 +} + +func (m *MrtOSPFHdr) Type() string { + return "OSPFHdr" +} + +func (m *MrtOSPFHdr) String() string { + remip := make(net.IP, 4) + locip := make(net.IP, 4) + remip[0] = byte(m.RemoteIP) + remip[1] = byte(m.RemoteIP >> 8) + remip[2] = 
byte(m.RemoteIP >> 16) + remip[3] = byte(m.RemoteIP >> 24) + locip[0] = byte(m.LocalIP) + locip[1] = byte(m.LocalIP >> 8) + locip[2] = byte(m.LocalIP >> 16) + locip[3] = byte(m.LocalIP >> 24) + return fmt.Sprintf("OSPF Header. Type [%d] Remote IP [%s] Local IP [%s]", m.otype, remip, locip) +} + +type MrtInfoMsg struct { + inftype uint16 + optmsg string +} + +func (m *MrtInfoMsg) String() string { + return fmt.Sprintf("Informational Message. Type [%v] Optstring [%s]", m.inftype, m.optmsg) +} + +func (m *MrtInfoMsg) Type() string { + return m.String() +} + +func (m *MrtMsg) PFunc() (ret parsefunc, ok bool) { + var subtype = m.Hdr.Mrt_subtype + var mtype = m.Hdr.Mrt_type + infofunc := func(a []byte) MrtSubTyper { + runes := []rune{} + infomsg := &MrtInfoMsg{inftype: mtype, optmsg: "No Optional Message"} + for len(a) > 0 { + r, sz := utf8.DecodeRune(a) + if r == utf8.RuneError { + logger.Println("failed to decode rune in optional message") + return infomsg + } + a = a[sz:] + runes = append(runes, r) + } + if len(runes) > 0 { + infomsg.optmsg = string(runes) + } + return infomsg + } + + ospffunc := func(a []byte) MrtSubTyper { + ret := &MrtOSPFHdr{otype: subtype} + buf := bytes.NewReader(a) + err := binary.Read(buf, binary.BigEndian, &ret.RemoteIP) + err = binary.Read(buf, binary.BigEndian, &ret.LocalIP) + if err != nil { + panic(fmt.Sprintf("error while reading binary OSPF header: %s", err)) + } + return ret + } + + bgp4mpscfunc := func(a []byte) MrtSubTyper { + ret := &MrtBGP4MPStateChangeHdr{} + buf := bytes.NewReader(a) + if subtype == BGP4MP_STATE_CHANGE { + ret.PeerASN = make([]byte, 2) + ret.LocalASN = make([]byte, 2) + } else { + ret.PeerASN = make([]byte, 4) + ret.LocalASN = make([]byte, 4) + } + err := binary.Read(buf, binary.BigEndian, &ret.PeerASN) + if err != nil { + panic(fmt.Sprintf("error while reading binary BGP4MP header: %s", err)) + } + binary.Read(buf, binary.BigEndian, &ret.LocalASN) + binary.Read(buf, binary.BigEndian, &ret.InterfaceInd) + 
binary.Read(buf, binary.BigEndian, &ret.AddrFamily) + if ret.AddrFamily == 1 { + ret.PeerIP = make([]byte, 4) + ret.LocalIP = make([]byte, 4) + } else if ret.AddrFamily == 2 { + ret.PeerIP = make([]byte, 16) + ret.LocalIP = make([]byte, 16) + } + binary.Read(buf, binary.BigEndian, &ret.PeerIP) + binary.Read(buf, binary.BigEndian, &ret.LocalIP) + binary.Read(buf, binary.BigEndian, &ret.OldState) + binary.Read(buf, binary.BigEndian, &ret.NewState) + return ret + } + + bgp4mpmsgfunc := func(a []byte) MrtSubTyper { + ret := &MrtBGP4MPMsgHdr{} + buf := bytes.NewReader(a) + if subtype == BGP4MP_MESSAGE { + ret.PeerASN = make([]byte, 2) + ret.LocalASN = make([]byte, 2) + } else if subtype == BGP4MP_MESSAGE_AS4 { + ret.PeerASN = make([]byte, 4) + ret.LocalASN = make([]byte, 4) + } + err := binary.Read(buf, binary.BigEndian, &ret.PeerASN) + if err != nil { + panic(fmt.Sprintf("error while reading binary BGP4MP header: %s", err)) + } + binary.Read(buf, binary.BigEndian, &ret.LocalASN) + binary.Read(buf, binary.BigEndian, &ret.InterfaceInd) + //fmt.Printf("ADdr family should be:%v\n", binary.BigEndian.Uint16(a[6:8])) + binary.Read(buf, binary.BigEndian, &ret.AddrFamily) + if ret.AddrFamily == 1 { + ret.PeerIP = make([]byte, 4) + ret.LocalIP = make([]byte, 4) + } else if ret.AddrFamily == 2 { + ret.PeerIP = make([]byte, 16) + ret.LocalIP = make([]byte, 16) + } else { + panic("Address Family in BGP4MP msg func is wrong") + } + binary.Read(buf, binary.BigEndian, &ret.PeerIP) + binary.Read(buf, binary.BigEndian, &ret.LocalIP) + return ret + } + + ret = nil + ok = false + switch mtype { + case MSG_PROTOCOL_BGP4MP: + if subtype == BGP4MP_STATE_CHANGE || subtype == BGP4MP_STATE_CHANGE_AS4 { + ret, ok = bgp4mpscfunc, true + } else if subtype == BGP4MP_MESSAGE || subtype == BGP4MP_MESSAGE_AS4 || + subtype == BGP4MP_MESSAGE_LOCAL || subtype == BGP4MP_MESSAGE_AS4_LOCAL { + ret, ok = bgp4mpmsgfunc, true + } + case MSG_START, MSG_I_AM_DEAD: + if subtype == 0 { + ret, ok = infofunc, true + 
} else { + logger.Println("Mrt type is Informational but Subtype non-zero") + } + case MSG_PROTOCOL_OSPF: + if subtype == 0 || subtype == 1 { + ret, ok = ospffunc, true + } else { + logger.Println("Mrt type is OSPF but Subtype is neither 0 or 1") + } + case MSG_NULL, MSG_DIE, MSG_PEER_DOWN, MSG_PROTOCOL_BGP, MSG_PROTOCOL_IDRP, MSG_PROTOCOL_BGP4PLUS, MSG_PROTOCOL_BGP4PLUS1: + logger.Println("Deprecated message type") + default: + logger.Printf("unknown. header [%v]\n", m.Hdr) + } + return +} + +type MrtBGP4MPStateChangeHdr struct { + PeerASN []byte + LocalASN []byte + InterfaceInd uint16 + AddrFamily uint16 + PeerIP []byte + LocalIP []byte + OldState uint16 + NewState uint16 +} + +func (m *MrtBGP4MPStateChangeHdr) Type() string { + return "BGP4MPStateChange" +} + +func (m *MrtBGP4MPStateChangeHdr) String() string { + return "BGP4MPStateChange" +} + +type MrtBGP4MPMsgHdr struct { + PeerASN []byte + LocalASN []byte + InterfaceInd uint16 + AddrFamily uint16 + PeerIP []byte + LocalIP []byte +} + +func (m *MrtBGP4MPMsgHdr) Type() string { + return "BGP4MPMsg" +} + +func (m *MrtBGP4MPMsgHdr) String() string { + if len(m.PeerIP) < 4 || len(m.LocalIP) < 4 { + return "BGP4MPMsg unable to read IPs" + } + return fmt.Sprintf("LocalIP:%s RemoteIP:%s", net.IPv4(m.PeerIP[0], m.PeerIP[1], m.PeerIP[2], m.PeerIP[3]), net.IPv4(m.LocalIP[0], m.LocalIP[1], m.LocalIP[2], m.LocalIP[3])) +} + +type MrtTableDumpV1Hdr struct { + ViewNum uint16 + SeqNum uint16 + Prefix []byte + PrefixLen uint8 + Status uint8 + OrigTime uint32 + PeerIP []byte + PeerAS uint16 + AttrLen uint16 +} + +func (m *MrtTableDumpV1Hdr) Type() string { + return "TableDumpV1Hdr" +} + +func (m *MrtTableDumpV1Hdr) String() string { + return "TableDumpV1Hdr" +} + +type MrtFile struct { + file io.Reader + entries uint32 + off int64 +} + +const ( + MrtHdr_size = 12 + dump_size = 10000 +) + +// mrt-type consts +const ( + MSG_NULL = iota // 0 empty msg (deprecated) + MSG_START // 1 sender is starting up + MSG_DIE // 2 receiver 
should shut down (deprecated) + MSG_I_AM_DEAD // 3 sender is shutting down + MSG_PEER_DOWN // 4 sender's peer is down (deprecated) + MSG_PROTOCOL_BGP // 5 msg is a BGP packet (deprecated) + MSG_PROTOCOL_RIP // 6 msg is a RIP packet + MSG_PROTOCOL_IDRP // 7 msg is an IDRP packet (deprecated) + MSG_PROTOCOL_RIPNG // 8 msg is a RIPNG packet + MSG_PROTOCOL_BGP4PLUS // 9 msg is a BGP4+ packet (deprecated) + MSG_PROTOCOL_BGP4PLUS1 // 10 msg is a BGP4+ (draft 01) (deprecated) + MSG_PROTOCOL_OSPF // 11 msg is an OSPF packet + MSG_TABLE_DUMP // 12 routing table dump + MSG_TABLE_DUMP_V2 // 13 routing table dump + MSG_PROTOCOL_BGP4MP = 16 // 16 zebras own packet format + MSG_PROTOCOL_BGP4MP_ET = 17 + MSG_PROTOCOL_ISIS = 32 // 32 msg is a ISIS package + MSG_PROTOCOL_ISIS_ET = 33 + MSG_PROTOCOL_OSPFV3 = 48 // 48 msg is a OSPFv3 package + MSG_PROTOCOL_OSPFV3_ET = 49 +) + +// mrt-subtype consts +const ( + BGP4MP_STATE_CHANGE = 0 // state change + BGP4MP_MESSAGE = 1 // bgp message + BGP4MP_MESSAGE_AS4 = 4 // same as BGP4MP_MESSAGE with 4byte AS + BGP4MP_STATE_CHANGE_AS4 = 5 + BGP4MP_MESSAGE_LOCAL = 6 // same as BGP4MP_MESSAGE but for self + BGP4MP_MESSAGE_AS4_LOCAL = 7 // originated updates. 
Not implemented +) + +const ( + OSPF_STATE_CHANGE = iota + OSPF_LSA_UPDATE +) + +func NewMrtHdr(b []byte) (ret MrtHdr, err error) { + buf := bytes.NewReader(b) + err = binary.Read(buf, binary.BigEndian, &ret) + return +} + +func NewMrtFile(f io.Reader) (ret MrtFile) { + ret = MrtFile{f, 0, 0} + return +} + +//This function can be passed into a bufio.Scanner.Split() to read buffered +//mrt msgs +func SplitMrt(data []byte, atEOF bool) (advance int, token []byte, err error) { + if atEOF && len(data) == 0 { + return 0, nil, nil + } + if cap(data) < MrtHdr_size { // read more + return 0, nil, nil + } + //this reads the data and (they are big endian so it handles that) + hdr, errh := NewMrtHdr(data[:MrtHdr_size]) + if errh != nil { + return 0, nil, errh + } + totlen := int(hdr.Mrt_len + MrtHdr_size) + if len(data) < totlen { //need to read more + return 0, nil, nil + } + //logger.Printf("scanned mrt with len:%d datalen is :%d", totlen, len(data)) + return totlen, data[0:totlen], nil +} + +func (f *MrtFile) Read(b []byte) (n int, err error) { + //fmt.Printf(" b len:%v cap:%v\n",len(b), cap(b)) + if cap(b) < MrtHdr_size { + err = errors.New("buffer size less than header size") + return + } + n, err = f.file.Read(b[:MrtHdr_size]) + if err != nil { + return + } + hdr, errh := NewMrtHdr(b[:MrtHdr_size]) + if errh != nil { + err = errors.New(fmt.Sprintf("error in reading header from offset %v : %s", f.off, errh)) + return + } + //fmt.Printf("got header at offset:%d ! 
:%v\n", f.off, hdr) + //n = int(hdr.Mrt_len+MrtHdr_size) + //f.off = f.off + int64(n) + f.entries = f.entries + 1 + //this will just jump over the msg + //noff,errs := f.file.Seek(int64(hdr.Mrt_len), os.SEEK_CUR) + if dump_size-(hdr.Mrt_len+MrtHdr_size) <= 0 { + err = errors.New(fmt.Sprintf("bgp message of size:%v at offset is too large", hdr.Mrt_len, f.off+MrtHdr_size)) + return + } + //fmt.Printf("i will access b[%v:%v] len:%v cap:%v\n",MrtHdr_size, hdr.Mrt_len+MrtHdr_size, len(b), cap(b)) + nr, err := f.file.Read(b[MrtHdr_size : hdr.Mrt_len+MrtHdr_size]) + if nr != int(hdr.Mrt_len) { + n = n + nr //header + len of read + err = errors.New(fmt.Sprintf("error in reading bgp message of size :%v . got :%v bytes.", hdr.Mrt_len, n)) + return + } + n = n + nr + f.off += int64(n) + //fmt.Printf("seeked at offset:%d \n", f.off) + return +} diff --git a/mrt/mrt_test.go b/mrt/mrt_test.go @@ -0,0 +1,118 @@ +package mrt + +import ( + "bufio" + "bytes" + "encoding/binary" + "fmt" + "net" + "os" + "testing" +) + +func TestMrtHdr(t *testing.T) { + buf := new(bytes.Buffer) + var tdate, tlen uint32 = 1, 4 + var ttype, tsubtype uint16 = 2, 3 + mrt := &MrtHdr{tdate, ttype, tsubtype, tlen} + fmt.Printf("date:%v type:%v subtype:%v len:%v\n", tdate, ttype, tsubtype, tlen) + binary.Write(buf, binary.BigEndian, mrt) + fmt.Printf("binary mrt: %x\n", buf.Bytes()) + mhdr, err := NewMrtHdr(buf.Bytes()) + if err != nil { + t.Fatal(err) + } + fmt.Printf("recreating MrtHdr from binary :%+v \n", mhdr) +} + +func TestMrtPFunc(t *testing.T) { + var ( + tt1, ts1 = uint16(1), uint16(0) //start + tt2, ts2 = uint16(3), uint16(1) //i am dead , but wrong subtype + tt3, ts3 = uint16(2), uint16(0) //deprecated + tt4, ts4 = uint16(11), uint16(0) //ospf state change + tbuf = []byte{0, 0, 0, 0, 0, 0, 0, 0} + tf parsefunc + ok bool + ) + //binbuf := new(bytes.Buffer) + mrt1 := &MrtMsg{ + Hdr: MrtHdr{1, tt1, ts1, 10}, + BGPMsg: tbuf, + } + mrt2 := &MrtMsg{ + Hdr: MrtHdr{1, tt2, ts2, 10}, + BGPMsg: tbuf, + } + 
mrt3 := &MrtMsg{ + Hdr: MrtHdr{1, tt3, ts3, 10}, + BGPMsg: tbuf, + } + mrt4 := &MrtMsg{ + Hdr: MrtHdr{1, tt4, ts4, 10}, + BGPMsg: tbuf, + } + fmt.Println("trying to parse informational message") + if tf, ok = mrt1.PFunc(); !ok { + t.Fatal("tf should be non nil") + } + hdr := tf(mrt1.BGPMsg) + fmt.Printf("type is :%s\n", hdr.Type()) + fmt.Println("trying to parse informational message with opt string") + mrt1.BGPMsg = []byte{'f', 'o', 'o', ' ', 's', 't', 'r'} + mrt1.Hdr.Mrt_type = tt2 + if tf, ok = mrt1.PFunc(); !ok { + t.Fatal("tf should be non nil") + } + hdr = tf(mrt1.BGPMsg) + fmt.Printf("type is :%s\n", hdr.Type()) + fmt.Println("trying to parse malformed informational message") + if tf, ok = mrt2.PFunc(); ok { + t.Fatal("this should fail with tf being nil cause subtype is non-0") + } + fmt.Println("trying to parse deprecated message") + if tf, ok = mrt3.PFunc(); ok { + t.Fatal("this should fail with tf being nil cause it's deprecated") + } + fmt.Println("trying to parse OSPF message") + //first call to littleendian to come to hostbyteorder and then switch to big + binary.BigEndian.PutUint32(mrt4.BGPMsg[:4], binary.LittleEndian.Uint32(net.IPv4(1, 2, 3, 4).To4())) + binary.BigEndian.PutUint32(mrt4.BGPMsg[4:], binary.LittleEndian.Uint32(net.IPv4(5, 6, 7, 8).To4())) + //binary.Write(binbuf, binary.BigEndian, net.IPv4allsys.To4()) + //mrt4.BGPMsg = make([]byte,8) + //mrt4.BGPMsg = binbuf.Bytes() + //copy(mrt4.BGPMsg,binbuf.Bytes()) + if tf, ok = mrt4.PFunc(); !ok { + t.Fatal("this shouldn't fail") + } + hdr = tf(mrt4.BGPMsg) + fmt.Printf("type is :%s .String representation: %s\n", hdr.Type(), hdr) +} + +func TestScan(t *testing.T) { + fmt.Println("testing the scanner interface") + f, err := os.Open("../tests/mrt3") + if err != nil { + t.Fatal(err) + } + mrtscanner := bufio.NewScanner(f) + mrtscanner.Split(SplitMrt) + count := 0 + for mrtscanner.Scan() { + count++ + dat := mrtscanner.Bytes() + h, _ := NewMrtHdr(dat[:MrtHdr_size]) /* the error has been checked in 
Read() */ + if h.Mrt_len == 0 { + t.Logf("terminating from 0 mrt len") + return + } + mrtmsg := MrtMsg{Hdr: h, BGPMsg: dat[MrtHdr_size:]} + if tf, ok := mrtmsg.PFunc(); ok { + tf(mrtmsg.BGPMsg) + } + } + if err := mrtscanner.Err(); err != nil { + fmt.Printf("error: %s", err) + } + fmt.Printf("scanned and parsed: %d entries from bufio\n", count) +} diff --git a/tests/mrt1 b/tests/mrt1 Binary files differ. diff --git a/tests/mrt2 b/tests/mrt2 Binary files differ. diff --git a/tests/mrt3 b/tests/mrt3 Binary files differ.