Skip to main content

vSphere Monitoring

With Telegraf + InfluxDB

    VMware vSphere - Overview | Grafana Labs
    Install Telegraf

    Download: https://portal.influxdata.com/downloads/

    yum localinstall telegraf-1.18.3-1.x86_64.rpm
    Configure Telegraf

    vi /etc/telegraf/telegraf.conf

    [agent]
    ...
        logfile = "/var/log/telegraf/telegraf.log"
    ...
    
    # Configuration for sending metrics to InfluxDB
    [[outputs.influxdb]]
        urls = ["http://10.10.2.209:8086"]
        database = "vmware"
        timeout = "0s"
        username = "admin"
        password = "dba4mis"
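        ## Name of an existing retention policy to write to (a name, not a duration).
        ## "autogen" is the default policy whose duration is set to 200d in the InfluxDB step below.
        retention_policy = "autogen"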
    
    
    # Read metrics from VMware vCenter
     [[inputs.vsphere]]
     ## List of vCenter URLs to be monitored. These three lines must be uncommented
     ## and edited for the plugin to work.
     vcenters = [ "https://10.10.1.2/sdk" ]
        username = "administrator@vsphere.local"
        password = "AdminPassword"
     #
     ## VMs
     ## Typical VM metrics (if omitted or empty, all metrics are collected)
     vm_metric_include = [
          "cpu.demand.average",
          "cpu.idle.summation",
          "cpu.latency.average",
          "cpu.readiness.average",
          "cpu.ready.summation",
          "cpu.run.summation",
          "cpu.usagemhz.average",
          "cpu.used.summation",
          "cpu.wait.summation",
          "mem.active.average",
          "mem.granted.average",
          "mem.latency.average",
          "mem.swapin.average",
          "mem.swapinRate.average",
          "mem.swapout.average",
          "mem.swapoutRate.average",
          "mem.usage.average",
          "mem.vmmemctl.average",
          "net.bytesRx.average",
          "net.bytesTx.average",
          "net.droppedRx.summation",
          "net.droppedTx.summation",
          "net.usage.average",
          "power.power.average",
          "virtualDisk.numberReadAveraged.average",
          "virtualDisk.numberWriteAveraged.average",
          "virtualDisk.read.average",
          "virtualDisk.readOIO.latest",
          "virtualDisk.throughput.usage.average",
          "virtualDisk.totalReadLatency.average",
          "virtualDisk.totalWriteLatency.average",
          "virtualDisk.write.average",
          "virtualDisk.writeOIO.latest",
          "sys.uptime.latest",
        ]
     # vm_metric_exclude = [] ## Nothing is excluded by default
     # vm_instances = true ## true by default
     #
     ## Hosts
     ## Typical host metrics (if omitted or empty, all metrics are collected)
     host_metric_include = [
          "cpu.coreUtilization.average",
          "cpu.costop.summation",
          "cpu.demand.average",
          "cpu.idle.summation",
          "cpu.latency.average",
          "cpu.readiness.average",
          "cpu.ready.summation",
          "cpu.swapwait.summation",
          "cpu.usage.average",
          "cpu.usagemhz.average",
          "cpu.used.summation",
          "cpu.utilization.average",
          "cpu.wait.summation",
          "disk.deviceReadLatency.average",
          "disk.deviceWriteLatency.average",
          "disk.kernelReadLatency.average",
          "disk.kernelWriteLatency.average",
          "disk.numberReadAveraged.average",
          "disk.numberWriteAveraged.average",
          "disk.read.average",
          "disk.totalReadLatency.average",
          "disk.totalWriteLatency.average",
          "disk.write.average",
          "mem.active.average",
          "mem.latency.average",
          "mem.state.latest",
          "mem.swapin.average",
          "mem.swapinRate.average",
          "mem.swapout.average",
          "mem.swapoutRate.average",
          "mem.totalCapacity.average",
          "mem.usage.average",
          "mem.vmmemctl.average",
          "net.bytesRx.average",
          "net.bytesTx.average",
          "net.droppedRx.summation",
          "net.droppedTx.summation",
          "net.errorsRx.summation",
          "net.errorsTx.summation",
          "net.usage.average",
          "power.power.average",
          "storageAdapter.numberReadAveraged.average",
          "storageAdapter.numberWriteAveraged.average",
          "storageAdapter.read.average",
          "storageAdapter.write.average",
          "sys.uptime.latest",
        ]
     # host_metric_exclude = [] ## Nothing excluded by default
     # host_instances = true ## true by default
     #
     ## Clusters
     cluster_metric_include = [] ## if omitted or empty, all metrics are collected
     # cluster_metric_exclude = [] ## Nothing excluded by default
     # cluster_instances = false ## false by default
     #
     ## Datastores
     datastore_metric_include = [] ## if omitted or empty, all metrics are collected
     # datastore_metric_exclude = [] ## Nothing excluded by default
     # datastore_instances = false ## false by default for Datastores only
     #
     ## Datacenters
     datacenter_metric_include = [] ## if omitted or empty, all metrics are collected
    # datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default.
     # datacenter_instances = false ## false by default
     #
     ## Plugin Settings
     ## separator character to use for measurement and field names (default: "_")
     # separator = "_"
     #
     ## number of objects to retrieve per query for realtime resources (vms and hosts)
     ## set to 64 for vCenter 5.5 and 6.0 (default: 256)
     # max_query_objects = 256
     #
     ## number of metrics to retrieve per query for non-realtime resources (clusters and datastores)
     ## set to 64 for vCenter 5.5 and 6.0 (default: 256)
     # max_query_metrics = 256
     #
     ## number of go routines to use for collection and discovery of objects and metrics
     # collect_concurrency = 1
     # discover_concurrency = 1
     #
     ## whether or not to force discovery of new objects on initial gather call before collecting metrics
     ## when true for large environments this may cause errors for time elapsed while collecting metrics
     ## when false (default) the first collection cycle may result in no or limited metrics while objects are discovered
     # force_discover_on_init = false
     #
     ## the interval before (re)discovering objects subject to metrics collection (default: 300s)
     # object_discovery_interval = "300s"
     #
     ## timeout applies to any of the api request made to vcenter
     # timeout = "60s"
     #
     ## Optional SSL Config
     # ssl_ca = "/path/to/cafile"
     # ssl_cert = "/path/to/certfile"
     # ssl_key = "/path/to/keyfile"
     ## Use SSL but skip chain & host verification
     insecure_skip_verify = true

    Startup Telegraf

    systemctl start telegraf
    systemctl enable telegraf
    Configure InfluxDB

    Set the retention policy

    [root@mm-mon ~]# influx -username admin -password dba4mis
    Connected to http://localhost:8086 version 1.8.5
    InfluxDB shell version: 1.8.5
    > show retention policies on vmware
    name    duration shardGroupDuration replicaN default
    ----    -------- ------------------ -------- -------
    autogen 0s       168h0m0s           1        true
    > alter retention policy "autogen" on "vmware" duration 200d shard duration 1d
    > show retention policies on vmware
    name    duration  shardGroupDuration replicaN default
    ----    --------  ------------------ -------- -------
    autogen 4800h0m0s 24h0m0s            1        true
    Configure Grafana
    1. Add a datasource for InfluxDB
      • Name: VMware
      • Type: InfluxDB
      • Database: vmware
      • Username: <InfluxDB Credential>
      • Password: <InfluxDB Credential>
    2. Import the dashboards
      1. https://grafana.com/grafana/dashboards/8159
      2. https://grafana.com/grafana/dashboards/8165
      3. https://grafana.com/grafana/dashboards/8168
      4. https://grafana.com/grafana/dashboards/8162
    FAQ

    Q: 之後新增的 VM 不會出現在 Dashboard。

    A: 先確認 InfluxDB 是否已寫入新 VM 的 data,如果有,只要更新 Dashboard Settings > Variables > virtualmachine > 執行 Update,檢查 Preview of values 是否有出現新 VM name。

    檢查 InfluxDB

    # Check all current VM names
    select DISTINCT("vmname") from (select "ready_summation","vmname" from "vsphere_vm_cpu" WHERE time > now() - 10m)

    With SexiGraf

    Download the OVA appliance
    vCenter/vSphere credential for monitoring only (read-only account)

    vCenter Web Client > 功能表 > 系統管理 > Single Sign On: 使用者與群組 > 新增

    • 使用者名稱: winmon
    • 密碼: xxxx
    • 確認密碼: xxxx

    vCenter Web Client > 功能表 > 主機與叢集 > 權限 > 新增權限

    • 使用者: vsphere.local , 搜尋 winmon
    • 角色: 唯讀
    • 散佈到子系: 勾選
    Deploy the OVA to vCenter/ESXi

    部署到 ESXi 6.5 時失敗,錯誤訊息

    Line 163: Unable to parse 'tools.syncTime' for attribute 'key' on element 'Config'.

    解決方法: 使用 OVF-Tool 先解開 OVA 檔,編輯 OVF 檔的內容

    # Before
    <vmw:Config ovf:required="true"  vmw:key="tools.syncTime" vmw:value="true"/>
    
    # After
    <vmw:Config ovf:required="false"  vmw:key="tools.syncTime" vmw:value="true"/>

    存檔後,重新再部署一次。


    First Start of the VM

    1. SSH Credential: root / Sex!Gr@f

    2. Configure the IP address manually by editing /etc/network/interfaces.

    3. Configure the hostname

    hostnamectl set-hostname esx-mon

    4. Configure the timezone and time server

    timedatectl set-timezone Asia/Taipei

    vi /etc/ntp.conf

    #pool 0.ubuntu.pool.ntp.org iburst
    #pool 1.ubuntu.pool.ntp.org iburst
    #pool 2.ubuntu.pool.ntp.org iburst
    #pool 3.ubuntu.pool.ntp.org iburst
    
    # Use Ubuntu's ntp server as a fallback.
    #pool ntp.ubuntu.com
    
    # Added the local time server
    server 192.168.21.86 prefer iburst

    Restart the ntpd

    systemctl stop ntp
    systemctl start ntp
    
    # Check the timeserver
    ntpq -p


    First Login to the Grafana Web UI
    1. Login: admin / Sex!Gr@f
    2. Add the credentials used to connect to the managed vCenter server: Search > SexiGraf > SexiGraf Web Admin > Credential Store
      • vCenter IP: <vCenter/ESXi IP or FQDN>
      • Username: <Username to login to vCenter/ESXi>
      • Password: <Password to login to vCenter/ESXi>