Grafana(10445) + Prometheus + Nginx 监控 HTTP API 2xx 3xx 4xx

安装

服务端性能监控最佳实践(一)—— 炫酷的Nginx请求分析监控

其中涉及的lua脚本等,github地址
不过其中的代码有问题,我fork了一份,修改后传到新地址了,具体问题见后续分析

nginx+OpenResty

Nginx 平滑升级至 OpenResty
Nginx的启动、停止与重启

bug

安装后,nginx日志报错

2021/01/23 14:38:21 [error] 85214#0: *13184 access forbidden by rule, client: 10.10.10.10, server: localhost, request: "HEAD / HTTP/1.0"
2021/01/23 14:38:21 [error] 85214#0: *13184 [lua] counter.lua:66: log(): latency=0,status=403,endpoint=nil,fullurl=nil while logging request, client: 10.10.10.10, server: localhost, request: "HEAD / HTTP/1.0"
2021/01/23 14:38:21 [error] 85214#0: *13184 [lua] prometheus.lua:317: log_error(): Wrong number of labels for nginx_http_request_duration_seconds. Expected 6, got 4 while logging request, client: 10.10.10.10, server: localhost, request: "HEAD / HTTP/1.0"

看起来是lua脚本有问题,debug看看

Mac+Idea+lua

Emmylua

在这里插入图片描述

IDEA+EmmyLua Lua开发环境搭建
在 mac osx 下进行 ulua 远程调试
其中,如果电脑是 M1 芯片(Apple Silicon 架构)的 Mac,可以加上参数
-arch arm64
最终没搞定

Lua

改为用lua插件

在这里插入图片描述
在这里插入图片描述

bug fix

单步调试搞定后,发现counter.lua有问题
原代码

local pcall = pcall
local ngx = ngx
local ngx_log = ngx.log
local ngx_err = ngx.ERR
local _M = {}
-- Initialise the shared state and the Prometheus histogram used by _M.log().
-- NOTE(review): uris, global_set, prometheus and metric_latency are assigned
-- without `local`, so they are created as global variables; _M.log() reads
-- metric_latency through that global.
function _M.init()
    -- Shared dictionaries looked up by name on ngx.shared
    -- (presumably declared with lua_shared_dict in nginx.conf -- TODO confirm).
    uris = ngx.shared.uri_by_host
    global_set = ngx.shared.global_set
    -- Reset the two flags kept in the global_set dictionary.
    global_set:set("initted", false)
    global_set:set("looped", false)
    -- Create the Prometheus registry on top of the "prometheus_metrics" dictionary name.
    prometheus = require("prometheus").init("prometheus_metrics") 
    -- Histogram with six labels; every observe() call must pass six label values.
    metric_latency = prometheus:histogram("nginx_http_request_duration_seconds", "HTTP request latency status", {"host", "status", "scheme", "method", "endpoint", "fullurl"})
end
-- Split `inputstr` into the runs of characters that are not in `sep`.
-- `sep` is spliced into a Lua character class, and defaults to "%s" (whitespace).
-- Empty fields are dropped because the pattern requires at least one character.
-- Returns an array-like table of the fields, in order.
local function split(inputstr, sep)
    if sep == nil then
        sep = "%s"
    end
    -- NOTE(review): `local` only applies to `t`; `i` is assigned as a global.
    local t={} ; i=1
    for str in string.gmatch(inputstr, "([^"..sep.."]+)") do
        t[i] = str
        i = i + 1
    end
    return t
end
-- Derive the "endpoint" and "fullurl" label values from a request URI.
-- Returns nil when the URI contains a "." or splits into exactly one segment;
-- otherwise returns a table with the keys "endpoint" and "fullurl".
-- NOTE(review): result_table, parts, endpoint and fullurl are assigned without
-- `local`, so they are globals and keep their values between calls.
local function parse_fullurl(request_uri)
    result_table = {}
    -- "%." is an escaped, literal dot: URIs that contain a "." are skipped.
    if string.find(request_uri, "%.") ~= nil then
       return nil
    end
    parts = split(request_uri, "/")
    -- Only the one-segment case is rejected. A URI such as "/" yields zero
    -- segments, passes this check and never enters the loop below, so
    -- endpoint/fullurl keep whatever the globals already held (nil on first use).
    if table.getn(parts) == 1 then
       return nil
    end
    for j=1, #parts do
       if(j == 1) then
           -- The first segment is the endpoint and also the start of fullurl.
           endpoint = "/"..parts[j]
           fullurl = "/"..parts[j]
       elseif(j <= 5) then
           -- Stop at the first segment that parses as a number.
           if tonumber(parts[j]) ~= nil then
               break
           end
           fullurl = fullurl.."/"..parts[j]
       else
           -- At most five segments contribute to fullurl.
           break
       end
    end
    result_table["endpoint"] = endpoint
    result_table["fullurl"] = fullurl
    return result_table
end
-- Record one latency observation for the current request in metric_latency.
-- Does nothing when parse_fullurl() returns nil for the request URI.
function _M.log()
    local request_host = ngx.var.host
    local request_uri = ngx.unescape_uri(ngx.var.uri)
    local request_status = ngx.var.status
    local request_scheme = ngx.var.scheme
    local request_method = ngx.var.request_method
    -- NOTE(review): remote_ip and ngx_sent are never read in this function.
    local remote_ip = ngx.var.remote_addr
    local ngx_sent = ngx.var.body_bytes_sent
    -- Falls back to 0 only when the variable is absent; a non-numeric string
    -- still makes tonumber(latency) below return nil.
    local latency = ngx.var.upstream_response_time or 0


    -- NOTE(review): result_table is assigned without `local`, i.e. as a global.
    result_table = parse_fullurl(request_uri)
    if result_table == nil then
        return
    end
    -- Logged at ngx.ERR level for every request that reaches this point.
    ngx_log(ngx_err,"latency=", tonumber(latency), ",status=", request_status, ",endpoint=", result_table["endpoint"], ",fullurl=", result_table["fullurl"])
    -- Six label values are expected; if endpoint/fullurl are nil the table
    -- constructor ends in nils and only four values are actually passed.
    metric_latency:observe(tonumber(latency), {request_host, request_status, request_scheme, request_method, result_table["endpoint"], result_table["fullurl"]})
end
return _M

其中有2个问题

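if string.find(request_uri, "%.") ~= nil then
改为
if string.find(request_uri, "%.") == nil then
~= 在 Lua 里表示"不等于"。string.find 在找到匹配时返回起止两个位置,找不到时返回 nil;模式 "%." 里的 % 是转义符,匹配的是字面量的点号 ".",而不是任意字符。所以原代码的逻辑是"URI 中包含点号就跳过",修改后变成"URI 中不包含点号就跳过"(例如报错日志里的 "/" 请求会在这里直接返回)。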
另外一个
table.getn(parts) == 1
table.getn 从 Lua 5.1 起已被弃用(Lua 5.2 中被移除),应改用长度运算符 #
#parts == 1

正确脚本

counter.lua

local pcall = pcall
local ngx = ngx
local ngx_log = ngx.log
local ngx_err = ngx.ERR
local _M = {}
--- Initialise the shared state and the Prometheus histogram used by _M.log().
-- The names uris, global_set, prometheus and metric_latency are deliberately
-- left as globals here: they are read outside this function (metric_latency
-- by _M.log()), so making them local would change the module's behaviour.
function _M.init()
    local shared = ngx.shared

    -- Shared dictionaries, looked up by name.
    uris = shared.uri_by_host
    global_set = shared.global_set

    -- Reset both flags stored in the global_set dictionary.
    global_set:set("initted", false)
    global_set:set("looped", false)

    -- Every observe() call on the histogram must supply one value per label.
    local label_names = {"host", "status", "scheme", "method", "endpoint", "fullurl"}

    prometheus = require("prometheus").init("prometheus_metrics")
    metric_latency = prometheus:histogram(
        "nginx_http_request_duration_seconds",
        "HTTP request latency status",
        label_names
    )
end
--- Split a string into its non-empty fields.
-- `sep` is spliced into a Lua character class, so it is a set of separator
-- characters (pattern syntax), not a multi-character delimiter.
-- @param inputstr string to split
-- @param sep separator character set; defaults to "%s" (whitespace)
-- @return array-like table of the fields, in order; empty fields are dropped
local function split(inputstr, sep)
    if sep == nil then
        sep = "%s"
    end
    local fields = {}
    -- Appending with #fields + 1 avoids a separate counter; the original
    -- counter `i` was missing `local` and leaked into the global table.
    for field in string.gmatch(inputstr, "([^" .. sep .. "]+)") do
        fields[#fields + 1] = field
    end
    return fields
end
--- Derive the "endpoint" and "fullurl" label values from a request URI.
-- endpoint is "/" plus the first path segment. fullurl is built from at most
-- the first five segments and stops early at the first segment that parses as
-- a number.
-- @param request_uri the request path
-- @return table with the keys "endpoint" and "fullurl", or nil when the
--         request should not be recorded
local function parse_fullurl(request_uri)
    -- "%." is an escaped, literal dot, so URIs WITHOUT a dot are skipped here.
    -- NOTE(review): the upstream script used `~= nil` (skip URIs that contain
    -- a dot); confirm which of the two behaviours is actually intended.
    if string.find(request_uri, "%.") == nil then
        return nil
    end

    local parts = split(request_uri, "/")
    -- Fewer than two segments: nothing useful to label. The zero-segment case
    -- is rejected as well, so nil label values can never be returned.
    if #parts <= 1 then
        return nil
    end

    -- All state is local: the previous globals were shared by every request
    -- handled by the same worker.
    local endpoint = "/" .. parts[1]
    local fullurl = endpoint
    for j = 2, math.min(#parts, 5) do
        if tonumber(parts[j]) ~= nil then
            break
        end
        fullurl = fullurl .. "/" .. parts[j]
    end

    return { endpoint = endpoint, fullurl = fullurl }
end
--- Record one latency observation for the current request.
-- Reads the request data from ngx.var, derives the endpoint/fullurl labels
-- with parse_fullurl(), and observes the latency on metric_latency. Requests
-- for which parse_fullurl() returns nil are not recorded.
function _M.log()
    local request_uri = ngx.unescape_uri(ngx.var.uri)
    local parsed = parse_fullurl(request_uri)
    if parsed == nil then
        return
    end

    -- ngx.var values are strings (or nil). tonumber() returns nil for anything
    -- that is not a plain number, so fall back to 0 rather than handing nil
    -- to observe().
    local latency = tonumber(ngx.var.upstream_response_time) or 0
    local request_status = ngx.var.status
    local endpoint = parsed["endpoint"]
    local fullurl = parsed["fullurl"]

    -- NOTE(review): this is emitted at ERR level for every recorded request;
    -- consider a lower level once debugging is finished.
    ngx_log(ngx_err, "latency=", latency, ",status=", request_status, ",endpoint=", endpoint, ",fullurl=", fullurl)

    -- Label order must match the label names the histogram was created with.
    metric_latency:observe(latency, {
        ngx.var.host,
        request_status,
        ngx.var.scheme,
        ngx.var.request_method,
        endpoint,
        fullurl,
    })
end
return _M

请求http://xxx.xxxx.xxx.xxx:9145/metrics
正常返回了
在这里插入图片描述

配置prometheus + Grafana

按照文章配置好后,发现页面endpoint, host等变量有数据,但是图表没有东西
在这里插入图片描述
找个panel点击编辑后
在这里插入图片描述
发现这里的 nginx_http_request_duration_seconds:qps_by_instance_host_endpoint_fullurl_2XX ,写的job,但是grafana里没有找到对应的定义,prometheus里也没有。
google了一下,发现
Git 地址
里定义了
在这里插入图片描述
按照这个prometheus.yml里的rule的定义,拷贝文件到服务器
在这里插入图片描述
再看grafana有图了
在这里插入图片描述

改进

抓回数据后,发现有对静态资源的请求也在日志范围内,如果想去掉静态请求,可以修改counter.lua文件,例如

    -- Skip requests for static assets so they are not recorded.
    -- In a Lua pattern a bare "." matches any character, so the dot is escaped
    -- with "%"; "$" anchors the extension at the end of the URI so that, for
    -- example, ".js" no longer also matches ".json".
    local static_suffixes = { "%.html$", "%.js$", "%.css$", "%.png$" }
    for _, suffix in ipairs(static_suffixes) do
        if string.find(request_uri, suffix) ~= nil then
            return nil
        end
    end

    -- Plain-text search (4th argument true): no pattern characters interpreted.
    if string.find(request_uri, "/static/", 1, true) ~= nil then
        return nil
    end
已标记关键词 清除标记
©️2020 CSDN 皮肤主题: 大白 设计师:CSDN官方博客 返回首页