Skip to main content

Telegraf

Installation

RHEL
# Register the InfluxData stable repository with yum. The here-document is
# fed straight to `sudo tee`; the backslash keeps $basearch from being
# expanded by the shell, so it reaches the repo file literally.
sudo tee /etc/yum.repos.d/influxdb.repo <<EOF
[influxdb]
name = InfluxData Repository - Stable
baseurl = https://repos.influxdata.com/stable/\$basearch/main
enabled = 1
gpgcheck = 1
gpgkey = https://repos.influxdata.com/influxdata-archive_compat.key
EOF

# Install Telegraf from the repository registered above.
sudo yum install telegraf
Ubuntu/Debian
# Download the InfluxData signing key into the current directory.
curl -s https://repos.influxdata.com/influxdata-archive_compat.key > influxdata-archive_compat.key
# Check the key against the expected SHA-256 digest; only when the check
# passes, convert it to a binary keyring under /etc/apt/trusted.gpg.d.
echo '393e8779c89ac8d958f81f942f9ad7fb82a25e133faddaf92e15b16e6ac9ce4c influxdata-archive_compat.key' | sha256sum -c \
  && gpg --dearmor < influxdata-archive_compat.key \
  | sudo tee /etc/apt/trusted.gpg.d/influxdata-archive_compat.gpg > /dev/null
# Add the repository, pinned to the keyring written above.
echo 'deb [signed-by=/etc/apt/trusted.gpg.d/influxdata-archive_compat.gpg] https://repos.influxdata.com/debian stable main' \
  | sudo tee /etc/apt/sources.list.d/influxdata.list
# Refresh the package index, then install Telegraf if the refresh succeeded.
sudo apt-get update \
  && sudo apt-get install telegraf

Configuration

# Write the output of `telegraf config` to telegraf.conf in the current directory.
telegraf config > telegraf.conf

Outputs.InfluxDB v1
###############################################################################
#                            OUTPUT PLUGINS                                   #
###############################################################################


# Configuration for sending metrics to InfluxDB
[[outputs.influxdb]]
  # Placeholder endpoint: replace the host part with the InfluxDB server address.
  urls = ["http://influxdb.server.ip.addr:8086"]
  # Placeholder name of the database that receives the metrics.
  database = "db-name"
  # NOTE(review): "0s" presumably disables the write timeout — confirm against
  # the outputs.influxdb plugin documentation before using it in production.
  timeout = "0s"
  # Placeholder credentials for the database user.
  username = "db-user"
  password = "db-pass"
Outputs.InfluxDB v2
###############################################################################
#                            OUTPUT PLUGINS                                   #
###############################################################################

# Configuration for sending metrics to InfluxDB v2 through the influxdb_v2 output.
[[outputs.influxdb_v2]]
  # Placeholder endpoint: replace the host part with the InfluxDB server address.
  urls = ["http://influxdb.server.ip.addr:8086"]
  # Placeholder values: substitute the real API token, organization and bucket.
  token = "example-token"
  organization = "example-org"
  bucket = "example-bucket"
Inputs.exec

data_format = "influx"

資料格式:

measurement,tag1=val1,tag2=val2 field1="v1",field2=1 0000000000000000000
  • 結尾的 0000000000000000000 是 Timestamp 欄位,為選擇性;省略時預設使用系統時間。
  • 詳細教學:Line protocol | InfluxDB OSS v2 Documentation (influxdata.com)

  • Scripts

    Samples #1

    #!/bin/bash
    #
    # Print per-disk I/O statistics in InfluxDB line protocol, one record per
    # device, for Telegraf's inputs.exec plugin (data_format = "influx"):
    #
    #   exec,datadir=/data/data11 r=4.1,w=6.1,await=0.83,svctm=1.35,util=1.46
    #
    # Record layout: measurement name, a comma, the comma-separated tags, one
    # space, then the comma-separated fields. A record that deviates from this
    # layout cannot be parsed by Telegraf and never reaches InfluxDB. When the
    # plugin sets name_suffix, Telegraf appends it to the measurement name.
    #
    # Requires bash 4+ (associative arrays), lsblk and iostat.

    # Print "<device> <mountpoint>" for every block device of type "disk".
    # A disk without a mountpoint is reported with "/".
    # The TYPE column is compared exactly, so a partition whose mountpoint
    # merely contains the word "disk" is not mistaken for a whole disk.
    list_disk_mounts() {
      lsblk -rn -o NAME,TYPE,MOUNTPOINT \
        | awk '$2 == "disk" { print $1, ($3 == "" ? "/" : $3) }'
    }

    # Print one line-protocol record for each sd* device listed by `iostat -x`.
    # The device-to-mountpoint mapping lives in an associative array, so no
    # `eval` is needed and device names are never executed as shell code.
    emit_iostat_lines() {
      local -A mount_of=()
      local dev mnt r w await svctm util

      while read -r dev mnt; do
        if [[ -n "$dev" ]]; then
          mount_of["$dev"]=$mnt
        fi
      done < <(list_disk_mounts)

      # Only rows whose device column starts with "sd" are used; header rows
      # (including the banner that contains the hostname) are ignored.
      # NOTE(review): the column numbers 4, 5, 10, 11, 12 are taken from the
      # original script; `iostat -x` layouts differ between sysstat versions,
      # so confirm they still map to r/s, w/s, await, svctm and %util.
      while read -r dev r w await svctm util; do
        # A device unknown to lsblk falls back to /dev/<name> so the tag value
        # is never empty (an empty tag value is invalid line protocol).
        printf 'exec,datadir=%s r=%s,w=%s,await=%s,svctm=%s,util=%s\n' \
          "${mount_of[$dev]:-/dev/$dev}" "$r" "$w" "$await" "$svctm" "$util"
      done < <(iostat -x | awk '$1 ~ /^sd/ { print $1, $4, $5, $10, $11, $12 }')
    }

    emit_iostat_lines
    # Run the iostat collection script through the exec input.
    [[inputs.exec]]
      ## Commands to execute; the script is started through bash.
      commands = ["bash /appcom/telegraf/collect_iostat.sh",]
      ## Time limit for the command.
      timeout='5s'
      ## Suffix appended to the measurement name printed by the script.
      name_suffix="_collectiostat"
      ## The script's output is parsed as InfluxDB line protocol.
      data_format="influx"

    Sample #2

    #!/bin/sh
    #
    # Print host uptime and load averages as one InfluxDB line-protocol record
    # for Telegraf's inputs.exec plugin:
    #
    #   uptime,host=<hostname> uptime=<seconds>,load1=<n>,load5=<n>,load15=<n>
    #
    # The values are read from /proc/uptime and /proc/loadavg rather than cut
    # out of `uptime` output by field position: that text changes shape with
    # the uptime itself ("up 5 min", "up 2:03", "up 5 days, 2:03"), which
    # shifts the fields and yields words such as "load" instead of numbers.

    # Print the record.
    #   $1 - uptime file  (default /proc/uptime; first field is seconds up)
    #   $2 - loadavg file (default /proc/loadavg; fields 1-3 are the 1, 5 and
    #        15 minute load averages)
    # The body runs in a subshell so its variables do not leak into the
    # caller. Returns non-zero when either file cannot be read.
    print_uptime_metric() (
      uptime_file=${1:-/proc/uptime}
      loadavg_file=${2:-/proc/loadavg}

      read -r up_seconds rest < "$uptime_file" || exit 1
      read -r load1 load5 load15 rest < "$loadavg_file" || exit 1

      printf 'uptime,host=%s uptime=%s,load1=%s,load5=%s,load15=%s\n' \
        "$(hostname)" "$up_seconds" "$load1" "$load5" "$load15"
    )

    print_uptime_metric
    # Telegraf configuration for the uptime sample: collects swap metrics and
    # the output of uptime.sh, and writes both to the "telegraf" database.
    [agent]
    # Collect from every input once per 5 seconds, aligned to interval boundaries.
    interval = "5s"
    round_interval = true
    [[inputs.swap]]
      # Tag added to every swap metric.
      [inputs.swap.tags]
        metrics_source="telegraf_demo"
    [[inputs.exec]]
      # The script prints InfluxDB line protocol, hence data_format = "influx".
      commands = ["/etc/telegraf/uptime.sh"]
      data_format = "influx"
      # Tag added to every metric produced by the script.
      [inputs.exec.tags]
        metrics_source="telegraf_demo"
    [[outputs.influxdb]]
      # The option is `urls` and takes a list, as in the InfluxDB v1 output
      # example; the singular `url` is a deprecated spelling.
      urls = ["https://influxdemo:8086"]
      database = "telegraf"

    Sample #3

    #! /bin/bash
    # Run a speed test with JSON output and let jq replace the download and
    # upload bandwidth values with the original value divided by 125000.
    # NOTE(review): presumably this converts bytes per second to Mbit/s
    # (125000 bytes/s = 1 Mbit/s) — confirm the unit in the speedtest JSON.
    /usr/bin/speedtest --format json | jq '.download.bandwidth = .download.bandwidth / 125000 |  .upload.bandwidth = .upload.bandwidth / 125000'
    # Run the speed test script every 5 minutes, allowing it up to 60 seconds.
    [[inputs.exec]]
      commands = [
        "/home/rock64/speedtest.sh"
        ]
      interval = "300s"
      timeout = "60s"
      # The script prints JSON, so the JSON parser must be selected explicitly;
      # without this setting the output is parsed as InfluxDB line protocol
      # and rejected.
      data_format = "json"

    Sample #4

    # One exec input per CPU core. Each command reads a temperature through
    # sysctl and pipes it through `tr -d C`, which deletes the letter "C" so
    # that the remaining text can be parsed as a single float value.
    # All four inputs write to the same measurement ("cpu_temp") and are told
    # apart by the "core" tag.
    # NOTE(review): dev.cpu.N.temperature looks like a FreeBSD sysctl name —
    # confirm the target operating system.

    # Core 0
    [[inputs.exec]]
      commands = ["sh -c 'sysctl -n dev.cpu.0.temperature | tr -d C'"]
      name_override = "cpu_temp"
      timeout = "5s"
      data_format = "value"
      data_type = "float"
      [inputs.exec.tags]
        core = "core0"
    
    # Core 1
    [[inputs.exec]]
      commands = ["sh -c 'sysctl -n dev.cpu.1.temperature | tr -d C'"]
      name_override = "cpu_temp"
      timeout = "5s"
      data_format = "value"
      data_type = "float"
      [inputs.exec.tags]
        core = "core1"
    
    # Core 2
    [[inputs.exec]]
      commands = ["sh -c 'sysctl -n dev.cpu.2.temperature | tr -d C'"]
      name_override = "cpu_temp"
      timeout = "5s"
      data_format = "value"
      data_type = "float"
      [inputs.exec.tags]
        core = "core2"
    
    # Core 3
    [[inputs.exec]]
      commands = ["sh -c 'sysctl -n dev.cpu.3.temperature | tr -d C'"]
      name_override = "cpu_temp"
      timeout = "5s"
      data_format = "value"
      data_type = "float"
      [inputs.exec.tags]
        core = "core3"

    Q & A

    [agent] Error terminating process: operation not permitted

    Causation: 在 telegraf.conf 設定裡,某個 inputs.exec 指令(以下稱 agent)排程啟動後,到了 timeout 設定的時間仍未完成工作,telegraf 嘗試終止該 agent 的程序但失敗。

    Solution: 解決方法一:如果無所謂終止 agent 失敗的行為,可以將 timeout 時間調大,就可以避免或降低錯誤的發生。

    解決方法二:如果想利用 timeout 的設定來避免 agent 可能因為某些異常造成大量程序累積,進而影響系統的運作,

    就需要先分析 telegraf 無法終止 agent 的原因,排除異常後,再依需要調整 timeout 時間。

    以筆者案例,agent 使用 sudo 指令收集 db2 的效能指標,指令如下

    [[inputs.exec]]
        interval = "1h"
        commands = ["sudo -u db2mon sh -c '/home/db2mon/bin/collect_db2x1h.sh -d centdb -a b_centdb'"]
        timeout = "5s"
        data_format = "influx"
    

    由於 telegraf 無法 kill 用 sudo 執行的其他帳號下的程序,解決方法是修改指令 collect_db2x1h.sh,可以讓 telegraf 不用 sudo 就可以執行。 

    [[inputs.exec]]
        interval = "1h"
        commands = ["/home/db2mon/bin/collect_db2x1h.sh -d centdb -a b_centdb"]
        timeout = "15s"
        data_format = "influx"

    驗證一下,timeout 時間到達能否成功終止 agent,如果有,會顯示下方訊息:

    [inputs.exec] Error in plugin: exec: command timed out for command '/home/db2mon/bin/collect_db2x1h.sh -d centdb -a b_centdb'

    沒問題後,再調整合適的 timeout。

    Error in plugin: metric parse error: expected tag at 7:20:

    Causation: 輸出的 InfluxDB line protocol 資料格式不正確

    Solution: 檢查輸出的第 7 行第 20 個字元(錯誤訊息中的 7:20)。InfluxDB line protocol 格式為

    measurement,tag-key1=tag-value1,tag-key2=tag-value2 field-key1=field-value1,field-key2=field-value2,....

    • tag-key type: string
    • tag-value type: string
      NOTE: 雙引號不是必要的
    • field-key type: string
    • field-value type: Float | Integer | UInteger | String | Boolean
      NOTE: 如果是 string 必須用雙引號
    max-series-per-database limit exceeded: (1000000)

    Causation: 寫入的資料庫,其 series 數量已經達到設定的上限 1000000。

    在 InfluxDB CLI 執行這段,檢查目前資料庫的 series 數量(series cardinality)

    show series cardinality on <db-name>

    Solution: 調整 InfluxDB 主機上的設定,編輯 /etc/influxdb/influxdb.conf 裡的 max-series-per-database(預設是 1000000)

    # max-series-per-database = 1000000
    max-series-per-database = 2000000

    重啟 InfluxDB

    systemctl restart influxdb