OTel Me a Story

Observing code in production with OpenTelemetry (OTEL)

https://giphy.com/gifs/creative-thinking-thinker-VbEuHLBUPQm55MyqJg
while True:
    try:
        x = int(input("Enter a number: "))
        break
    except ValueError:
        print("Oops!")

Observability

Observability

Metrics, Logs and Traces

Metrics

Metrics

Logs

Photo by Oliver Paaske on Unsplash

Logs

E0419 06:59:50.681167       1 reflector.go:138] pkg/mod/k8s.io/client-go@v0.22.4/tools/cache/reflector.go:167: Failed to watch *v1.Endpoints: the server has asked for the client to provide credentials (get endpoints)

time="2024-04-17T20:10:41Z" level=info msg="successfully synced configuration to kong." subsystem=proxy-cache-resolver

Logs

Logs

Logs

Traces

Traces

get '/posts' do
  Datadog::Tracing.trace('web.request', service: 'my-blog', resource: 'GET /posts') do |span|
    # Trace the activerecord call
    Datadog::Tracing.trace('posts.fetch') do
      @posts = Posts.order(created_at: :desc).limit(10)
    end

    # Add some APM tags
    span.set_tag('http.method', request.request_method)
    span.set_tag('posts.count', @posts.length)

    # Trace the template rendering
    Datadog::Tracing.trace('template.render') do
      erb :index
    end
  end
end

Traces

Traces

Traces

1.

2.

OTel captures traces, metrics, logs, and other application telemetry data and lets you send it to the backend of your choice.


 

OpenTelemetry (OTEL)

OTel ≈ Observability api + spec

OTel me, code, why are you slow?

// main.go
func main() {
	http.HandleFunc("/rolldice", rolldice)
	log.Fatal(http.ListenAndServe(":8000", nil))
}
// rolldice.go 
func rolldice(w http.ResponseWriter, r *http.Request) {
	roll := 1 + rand.Intn(6)

	// Sleep for a few seconds, so we can see longer traces
	// And breakout the slow api call versus other work
	// that might be done in this function, i.e. a DB query
	time.Sleep(time.Millisecond * time.Duration(roll*100))

	log.Println("simulating a call to slow api")
	err := simulateSlowAPI(500 * roll)
	if err != nil {
		log.Printf("Failed to make call to simulate slow api")
	}

	resp := strconv.Itoa(roll) + "\n"
	if _, err := io.WriteString(w, resp); err != nil {
		log.Printf("Write failed: %v\n", err)
	}
}
// rolldice.go
// A function for simulating slowness with an external http call
func simulateSlowAPI(sleepInMilliseconds int) error {
    // Construct the URL with the sleep parameter
	baseURL := "https://fakeresponder.com/"
	sleepParam := "?sleep=" + strconv.Itoa(sleepInMilliseconds)

	// Create the full URL
	fullURL := baseURL + sleepParam

	// Make the GET request
	resp, err := http.Get(fullURL)
	if err != nil {
		log.Println("Error making the GET request:", err)
		return err
	}
// > go run .

2024/04/24 22:03:11 simulating a call to slow api
2024/04/24 22:03:16 simulating a call to slow api
2024/04/24 22:03:20 simulating a call to slow api
2024/04/24 22:03:23 simulating a call to slow api

// > watch -n 2 'curl -s localhost:8000/rolldice
Every 2.0s: curl -s localhost:8000/rolldice

4
## TODO: find out what's slow

_Assuming we didn't already know_ :)

- [ ] Add more logs
- [ ] Add metrics (response time, number of calls, etc)

*OR*

- [x] Add OTel
// main.go
	import "go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp"

	// Set up OpenTelemetry.
	otelShutdown, err := setupOTelSDK(ctx)
	if err != nil {
		return
	}
	// Handle shutdown properly so nothing leaks.
	defer func() {
		err = errors.Join(err, otelShutdown(context.Background()))
	}()
    
    ...
    
    handleFunc := func(pattern string, handlerFunc func(http.ResponseWriter, *http.Request)) {
	// Configure the "http.route" for the HTTP instrumentation.
	handler := otelhttp.WithRouteTag(pattern, http.HandlerFunc(handlerFunc))
	mux.Handle(pattern, handler)

    handler := otelhttp.NewHandler(mux, "/")
// otel.go

func setupOTelSDK(ctx context.Context) (shutdown func(context.Context) error, err error) {

	// Set up trace provider.
	tracerProvider, err := newTraceProvider()
	if err != nil {
		handleErr(err)
		return
	}
	shutdownFuncs = append(shutdownFuncs, tracerProvider.Shutdown)
	otel.SetTracerProvider(tracerProvider)
    
    case "stdout":
	  exporter, err = stdouttrace.New(
        stdouttrace.WithPrettyPrint()
      )


// rolldice.go
var (
	tracer  = otel.Tracer("rolldice")
	meter   = otel.Meter("rolldice")
	rollCnt metric.Int64Counter
)

func rolldice(w http.ResponseWriter, r *http.Request) {
	ctx, span := tracer.Start(r.Context(), "rolling dice")
	defer span.End()

	roll := 1 + rand.Intn(6)

	// Sleep for a few seconds, so we can see longer traces
	time.Sleep(time.Millisecond * time.Duration(roll*100))

	span.AddEvent("simulating a call to slow api")
	err := simulateSlowAPI(500*roll, ctx)
	if err != nil {
		log.Printf("Failed to make call to simulate slow api")
	}
// rolldice.go

func simulateSlowAPI(sleepInMilliseconds int, ctx context.Context) error {
	ctx, childSpan := tracer.Start(ctx, "simulatedSlowApiSpan")
	defer childSpan.End()
	// Construct the URL with the sleep parameter
	baseURL := "https://fakeresponder.com/"
	sleepParam := "?sleep=" + strconv.Itoa(sleepInMilliseconds)
(base) ➜  otel-dice-example git:(otel-with-stdout) ✗ go run .

{"Resource":[
  {"Key":"service.name","Value":
   {"Type":"STRING","Value":"dice"}
  },
  {"Key":"service.version","Value":
   {"Type":"STRING","Value":"0.1.0"}
  },
  {"Key":"telemetry.sdk.language","Value":
   {"Type":"STRING","Value":"go"}
  },
  {"Key":"telemetry.sdk.name","Value":
   {"Type":"STRING","Value":"opentelemetry"}
  },
  {"Key":"telemetry.sdk.version","Value":
   {"Type":"STRING","Value":"1.25.0"}
  }
],"ScopeMetrics":[]}

{"Resource":[{"Key":"service.name","Value":{"Type":"STRING","Value":"dice"}},{"Key":"service.version","Value":{"Type":"STRING","Value":"0.1.0"}},{"Key":"telemetry.sdk.language","Value":{"Type":"STRING","Value":"go"}},{"Key":"telemetry.sdk.name","Value":{"Type":"STRING","Value":"opentelemetry"}},{"Key":"telemetry.sdk.version","Value":{"Type":"STRING","Value":"1.25.0"}}],"ScopeMetrics":[]}
{
	"Name": "/",
	"SpanContext": {
		"TraceID": "5a844fd338c4c00ec001fb28ea352c64",
		"SpanID": "4908426582a52298",
	},
	"Parent": {
		"TraceID": "00000000000000000000000000000000",
		"SpanID": "0000000000000000",
	},
	"SpanKind": 2,
	"StartTime": "2024-04-25T00:21:00.179788-06:00",
	"EndTime": "2024-04-25T00:21:01.471713625-06:00",
	"Attributes": [
		{
			"Key": "http.route",
			"Value": {
				"Type": "STRING",
				"Value": "/rolldice"
			}
		},
{
	"Name": "rolling dice",
	"SpanContext": {
		"TraceID": "5a844fd338c4c00ec001fb28ea352c64",
		"SpanID": "7e394450317027c5",
	},
	"Parent": {
		"TraceID": "5a844fd338c4c00ec001fb28ea352c64",
		"SpanID": "4908426582a52298",
	},
    "Attributes": [
		{
			"Key": "roll.value",
			"Value": {
				"Type": "INT64",
				"Value": 2
			}
		}
	],

🎲

> docker run -d --name jaeger \
  -e COLLECTOR_OTLP_ENABLED=true \
  -p 16686:16686 \
  -p 4317:4317 \
  -p 4318:4318 \
  jaegertracing/all-in-one:latest
  

> OTEL_EXPORTER_OTLP_ENDPOINT=http://localhost:4318 go run .
// otel.go

func createOtelExporter(exporterType string) (trace.SpanExporter, error) {

    case "otlp":
		var opts []otlptracehttp.Option
	
		exporter, err = otlptrace.New(
			context.Background(),
			otlptracehttp.NewClient(opts...),
		)
	case "stdout":
		exporter, err = stdouttrace.New(stdouttrace.WithPrettyPrint())

OTel your future self

Win a print copy of Observability Engineering

bit.ly/vc-hc-obsengbook​

Observability

Thank you!

OTel Me a Story: Observing Code in Production

By virtualandy

OTel Me a Story: Observing Code in Production

You shipped your code and someone says it’s busted. How can it be?! Observability is a super power to help you diagnose your systems. OpenTelemetry - or OTEL - is a set of APIs, SDKs and specs for observability. We’ll briefly cover Observability and how to get started so you can analyze and improve your software.

  • 35