vSphere Monitoring

With Telegraf + InfluxDB

Install Telegraf

Download: https://portal.influxdata.com/downloads/

yum localinstall telegraf-1.18.3-1.x86_64.rpm

Configure Telegraf

Create a configuration file

telegraf config > /etc/telegraf/telegraf-vmware.conf

vi /etc/telegraf/telegraf-vmware.conf

Log file

...
[agent]
...
  logfile = "/var/log/telegraf/telegraf-vmware.log"
...
  ## If set to true, do not set the "host" tag in the telegraf agent.
  omit_hostname = true

Output for InfluxDB 1.x

# Configuration for sending metrics to InfluxDB 1.x
[[outputs.influxdb]]
  ## InfluxDB 1.x endpoint and the target database.
  urls = ["http://10.10.2.209:8086"]
  database = "vmware"
  timeout = "0s"

  ## Credentials of the InfluxDB user used for writing.
  username = "admin"
  password = "dba4mis"

  ## Name of an existing retention policy to write to. This is a policy
  ## NAME, not a duration: the 200-day retention is configured on the
  ## "autogen" policy on the InfluxDB side (see "Set the retention policy").
  retention_policy = "autogen"

Output for InfluxDB 2.x

[[outputs.influxdb_v2]]
  ## The URLs of the InfluxDB cluster nodes.
  ##
  ## Multiple URLs can be specified for a single cluster, only ONE of the
  ## urls will be written to each interval.
  ##   ex: urls = ["https://us-west-2-1.aws.cloud2.influxdata.com"]
  urls = ["http://127.0.0.1:8086"]

  ## Token for authentication. Replace the placeholder with a real token.
  token = "Your-Token"

  ## Organization is the name of the organization you wish to write to.
  organization = "Your-Org-Name"

  ## Destination bucket to write into. Replace the placeholder with a real bucket.
  bucket = "Your-Bucket-Name"

  ## Timeout for HTTP messages.
  timeout = "5s"

Input

###############################################################################
#                            INPUT PLUGINS                                    #
###############################################################################


# Read metrics from one or many vCenters
## Realtime instance: collects host and VM metrics on a short interval.
[[inputs.vsphere]]
  interval = "20s"

  ## List of vCenter URLs to be monitored. These three lines must be
  ## edited for the plugin to work.
  vcenters = [ "https://vcenter-server-ip/sdk" ]
  username = "admin@vsphere.local"
  password = "ThisPassword"

  ## Empty include lists: all VM and host metrics are collected.
  vm_metric_include = []
  host_metric_include = []

  ## Exclude all non-realtime (historical) metrics from this instance;
  ## they are gathered by the separate 300s instance instead.
  cluster_metric_exclude = ["*"]
  datastore_metric_exclude = ["*"]
  datacenter_metric_exclude = ["*"]

  max_query_metrics = 256
  timeout = "60s"

  ## Use SSL but skip chain & host verification.
  insecure_skip_verify = true

## Historical instance: clusters, datacenters and datastores on a 300s interval
[[inputs.vsphere]]
  interval = "300s"

  ## vCenter endpoint and credentials
  vcenters = [ "https://vcenter-server-ip/sdk" ]
  username = "admin@vsphere.local"
  password = "ThisPassword"
  insecure_skip_verify = true

  ## Realtime resources are left out of this instance
  host_metric_exclude = ["*"] # Exclude realtime metrics
  vm_metric_exclude = ["*"] # Exclude realtime metrics

  ## Metrics gathered by this instance
  cluster_metric_include = ["*"]
  datacenter_metric_include = ["*"]
  datastore_metric_include = [ "disk.capacity.latest", "disk.used.latest", "disk.provisioned.latest"]

  ## Discovery and query tuning
  force_discover_on_init = true
  max_query_metrics = 256
  collect_concurrency = 3

參考範例: Telegraf: VMware vSphere Input Plugin

# Read metrics from VMware vCenter
# Reference example: one plugin instance with explicit VM and host metric
# lists; cluster, datastore and datacenter include lists are left empty.
 [[inputs.vsphere]]
 ## List of vCenter URLs to be monitored. These three lines must be
 ## edited for the plugin to work.
 vcenters = [ "https://10.10.1.2/sdk" ]
    username = "administrator@vsphere.local"
    password = "AdminPassword"
 #
 ## VMs
 ## Typical VM metrics (if omitted or empty, all metrics are collected)
 vm_metric_include = [
      "cpu.demand.average",
      "cpu.idle.summation",
      "cpu.latency.average",
      "cpu.readiness.average",
      "cpu.ready.summation",
      "cpu.run.summation",
      "cpu.usagemhz.average",
      "cpu.used.summation",
      "cpu.wait.summation",
      "mem.active.average",
      "mem.granted.average",
      "mem.latency.average",
      "mem.swapin.average",
      "mem.swapinRate.average",
      "mem.swapout.average",
      "mem.swapoutRate.average",
      "mem.usage.average",
      "mem.vmmemctl.average",
      "net.bytesRx.average",
      "net.bytesTx.average",
      "net.droppedRx.summation",
      "net.droppedTx.summation",
      "net.usage.average",
      "power.power.average",
      "virtualDisk.numberReadAveraged.average",
      "virtualDisk.numberWriteAveraged.average",
      "virtualDisk.read.average",
      "virtualDisk.readOIO.latest",
      "virtualDisk.throughput.usage.average",
      "virtualDisk.totalReadLatency.average",
      "virtualDisk.totalWriteLatency.average",
      "virtualDisk.write.average",
      "virtualDisk.writeOIO.latest",
      "sys.uptime.latest",
    ]
 # vm_metric_exclude = [] ## Nothing is excluded by default
 # vm_instances = true ## true by default
 #
 ## Hosts
 ## Typical host metrics (if omitted or empty, all metrics are collected)
 host_metric_include = [
      "cpu.coreUtilization.average",
      "cpu.costop.summation",
      "cpu.demand.average",
      "cpu.idle.summation",
      "cpu.latency.average",
      "cpu.readiness.average",
      "cpu.ready.summation",
      "cpu.swapwait.summation",
      "cpu.usage.average",
      "cpu.usagemhz.average",
      "cpu.used.summation",
      "cpu.utilization.average",
      "cpu.wait.summation",
      "disk.deviceReadLatency.average",
      "disk.deviceWriteLatency.average",
      "disk.kernelReadLatency.average",
      "disk.kernelWriteLatency.average",
      "disk.numberReadAveraged.average",
      "disk.numberWriteAveraged.average",
      "disk.read.average",
      "disk.totalReadLatency.average",
      "disk.totalWriteLatency.average",
      "disk.write.average",
      "mem.active.average",
      "mem.latency.average",
      "mem.state.latest",
      "mem.swapin.average",
      "mem.swapinRate.average",
      "mem.swapout.average",
      "mem.swapoutRate.average",
      "mem.totalCapacity.average",
      "mem.usage.average",
      "mem.vmmemctl.average",
      "net.bytesRx.average",
      "net.bytesTx.average",
      "net.droppedRx.summation",
      "net.droppedTx.summation",
      "net.errorsRx.summation",
      "net.errorsTx.summation",
      "net.usage.average",
      "power.power.average",
      "storageAdapter.numberReadAveraged.average",
      "storageAdapter.numberWriteAveraged.average",
      "storageAdapter.read.average",
      "storageAdapter.write.average",
      "sys.uptime.latest",
    ]
 # host_metric_exclude = [] ## Nothing excluded by default
 # host_instances = true ## true by default
 #
 ## Clusters
 cluster_metric_include = [] ## if omitted or empty, all metrics are collected
 # cluster_metric_exclude = [] ## Nothing excluded by default
 # cluster_instances = false ## false by default
 #
 ## Datastores
 datastore_metric_include = [] ## if omitted or empty, all metrics are collected
 # datastore_metric_exclude = [] ## Nothing excluded by default
 # datastore_instances = false ## false by default for Datastores only
 #
 ## Datacenters
 datacenter_metric_include = [] ## if omitted or empty, all metrics are collected
 # datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default.
 # datacenter_instances = false ## false by default
 #
 ## Plugin Settings
 ## separator character to use for measurement and field names (default: "_")
 # separator = "_"
 #
 ## number of objects to retrieve per query for realtime resources (vms and hosts)
 ## set to 64 for vCenter 5.5 and 6.0 (default: 256)
 # max_query_objects = 256
 #
 ## number of metrics to retrieve per query for non-realtime resources (clusters and datastores)
 ## set to 64 for vCenter 5.5 and 6.0 (default: 256)
 # max_query_metrics = 256
 #
 ## number of go routines to use for collection and discovery of objects and metrics
 # collect_concurrency = 1
 # discover_concurrency = 1
 #
 ## whether or not to force discovery of new objects on initial gather call before collecting metrics
 ## when true for large environments this may cause errors for time elapsed while collecting metrics
 ## when false (default) the first collection cycle may result in no or limited metrics while objects are discovered
 # force_discover_on_init = false
 #
 ## the interval before (re)discovering objects subject to metrics collection (default: 300s)
 # object_discovery_interval = "300s"
 #
 ## timeout applies to any of the api request made to vcenter
 # timeout = "60s"
 #
 ## Optional SSL Config
 # ssl_ca = "/path/to/cafile"
 # ssl_cert = "/path/to/certfile"
 # ssl_key = "/path/to/keyfile"
 ## Use SSL but skip chain & host verification
 insecure_skip_verify = true

Configure systemd

# Create a dedicated unit for the vSphere instance. Administrator-created units
# go in /etc/systemd/system, separate from the package-owned /usr/lib tree.
cp /usr/lib/systemd/system/telegraf.service /etc/systemd/system/telegraf-vmware.service
# Point the new unit at the vSphere config file (dot escaped to match literally).
sed -i 's/telegraf\.conf/telegraf-vmware.conf/g' /etc/systemd/system/telegraf-vmware.service

Startup Telegraf

systemctl daemon-reload
systemctl start telegraf-vmware
systemctl enable telegraf-vmware

Configure InfluxDB

Set the retention policy

[root@mm-mon ~]# influx -username admin -password dba4mis
Connected to http://localhost:8086 version 1.8.5
InfluxDB shell version: 1.8.5
> show retention policies on vmware
name    duration shardGroupDuration replicaN default
----    -------- ------------------ -------- -------
autogen 0s       168h0m0s           1        true
> alter retention policy "autogen" on "vmware" duration 200d shard duration 1d
> show retention policies on vmware
name    duration  shardGroupDuration replicaN default
----    --------  ------------------ -------- -------
autogen 4800h0m0s 24h0m0s            1        true

Configure Grafana

Add a datasource for InfluxDB
- Name: VMware
- Type: InfluxDB
- Database: vmware
- Username: <InfluxDB Credential>
- Password: <InfluxDB Credential>
Import the dashboards

FAQ

Q: 之後新增的 VM 不會出現在 Dashboard。

A: 先確認 InfluxDB 是否已寫入新 VM 的 data，如果有，只要更新 Dashboard Settings > Variables > virtualmachine > 執行 Update，檢查 Preview of values 是否有出現新 VM name。

檢查 InfluxDB

# Check all current VM names
select DISTINCT("vmname") from (select "ready_summation","vmname" from "vsphere_vm_cpu" WHERE time > now() - 10m)

Q: Telegraf 錯誤訊息

[inputs.vsphere] Error in plugin: while collecting vm: ServerFaultCode: A specified parameter was not correct: querySpec[0].endTime

A: 如果是第一次啟動，等待第3次一樣的錯誤後，就自動不再發生。

With SexiGraf

Official: http://www.sexigraf.fr/quickstart/
OS-based: Ubuntu 16.04.6 LTS

Download the OVA appliance

vCenter/vSphere Credential for monitor only

vCenter Web Client > 功能表 > 系統管理 > Single Sign On: 使用者與群組 > 新增

使用者名稱: winmon
密碼: xxxx
確認密碼: xxxx

vCenter Web Client > 功能表 > 主機與叢集 > 權限 > 新增權限

使用者: vsphere.local , 搜尋 winmon
角色: 唯讀
散佈到子係: 勾選

Deploy the OVA to vCenter/ESXi

部署到 ESXi 6.5 時失敗，錯誤訊息

Line 163: Unable to parse 'tools.syncTime' for attribute 'key' on element 'Config'.

解決方法: 使用 OVF-Tool 先解開 OVA 檔，編輯 OVF 檔的內容

# Before
<vmw:Config ovf:required="true"  vmw:key="tools.syncTime" vmw:value="true"/>

# After
<vmw:Config ovf:required="false"  vmw:key="tools.syncTime" vmw:value="true"/>

存檔後，重新再部署一次。

First Start of the VM

1. SSH Credential: root / Sex!Gr@f

2. Manually configure the IP address by editing /etc/network/interfaces.

3. Configure the hostname

hostnamectl set-hostname esx-mon

4. Configure the timezone and time server

timedatectl set-timezone Asia/Taipei

vi /etc/ntp.conf

#pool 0.ubuntu.pool.ntp.org iburst
#pool 1.ubuntu.pool.ntp.org iburst
#pool 2.ubuntu.pool.ntp.org iburst
#pool 3.ubuntu.pool.ntp.org iburst

# Use Ubuntu's ntp server as a fallback.
#pool ntp.ubuntu.com

# Added the local time server
server 192.168.21.86 prefer iburst

Restart the ntpd

systemctl stop ntp
systemctl start ntp

# Check the timeserver
ntpq -p