vSphere Monitoring
With Telegraf + InfluxDB
Install Telegraf
Download: https://portal.influxdata.com/downloads/
yum localinstall telegraf-1.18.3-1.x86_64.rpm
Configure Telegraf
Create a configuration file
telegraf config > /etc/telegraf/telegraf-vmware.conf
vi /etc/telegraf/telegraf-vmware.conf
Output for InfluxDB 1.x
# Configuration for sending metrics to InfluxDB 1.x
[[outputs.influxdb]]
urls = ["http://10.10.2.209:8086"]
database = "vmware"
timeout = "0s"
# NOTE(review): credentials are stored in plain text; restrict read access to this file.
username = "admin"
password = "dba4mis"
## Name of an EXISTING retention policy to write to -- this is a policy name,
## not a duration. The 200-day retention is configured on the InfluxDB side
## by altering the duration of the default "autogen" policy.
retention_policy = "autogen"
Output for InfluxDB 2.x
# Configuration for sending metrics to InfluxDB 2.x.
# The token, organization and bucket values below are placeholders and must
# be replaced with real values.
[[outputs.influxdb_v2]]
## The URLs of the InfluxDB cluster nodes.
##
## Multiple URLs can be specified for a single cluster, only ONE of the
## urls will be written to each interval.
## ex: urls = ["https://us-west-2-1.aws.cloud2.influxdata.com"]
urls = ["http://127.0.0.1:8086"]
## Token for authentication.
token = "Your-Token"
## Organization is the name of the organization you wish to write to.
organization = "Your-Org-Name"
## Destination bucket to write into.
bucket = "Your-Bucket-Name"
Input
[agent]
...
logfile = "/var/log/telegraf/telegraf.log"
...
# Read metrics from VMware vCenter
[[inputs.vsphere]]
## List of vCenter URLs to be monitored. These three lines must be uncommented
## and edited for the plugin to work.
## NOTE(review): the credentials below are stored in plain text; restrict
## read access to this file.
vcenters = [ "https://10.10.1.2/sdk" ]
username = "administrator@vsphere.local"
password = "AdminPassword"
#
## VMs
## Typical VM metrics (if omitted or empty, all metrics are collected)
vm_metric_include = [
"cpu.demand.average",
"cpu.idle.summation",
"cpu.latency.average",
"cpu.readiness.average",
"cpu.ready.summation",
"cpu.run.summation",
"cpu.usagemhz.average",
"cpu.used.summation",
"cpu.wait.summation",
"mem.active.average",
"mem.granted.average",
"mem.latency.average",
"mem.swapin.average",
"mem.swapinRate.average",
"mem.swapout.average",
"mem.swapoutRate.average",
"mem.usage.average",
"mem.vmmemctl.average",
"net.bytesRx.average",
"net.bytesTx.average",
"net.droppedRx.summation",
"net.droppedTx.summation",
"net.usage.average",
"power.power.average",
"virtualDisk.numberReadAveraged.average",
"virtualDisk.numberWriteAveraged.average",
"virtualDisk.read.average",
"virtualDisk.readOIO.latest",
"virtualDisk.throughput.usage.average",
"virtualDisk.totalReadLatency.average",
"virtualDisk.totalWriteLatency.average",
"virtualDisk.write.average",
"virtualDisk.writeOIO.latest",
"sys.uptime.latest",
]
# vm_metric_exclude = [] ## Nothing is excluded by default
# vm_instances = true ## true by default
#
## Hosts
## Typical host metrics (if omitted or empty, all metrics are collected)
host_metric_include = [
"cpu.coreUtilization.average",
"cpu.costop.summation",
"cpu.demand.average",
"cpu.idle.summation",
"cpu.latency.average",
"cpu.readiness.average",
"cpu.ready.summation",
"cpu.swapwait.summation",
"cpu.usage.average",
"cpu.usagemhz.average",
"cpu.used.summation",
"cpu.utilization.average",
"cpu.wait.summation",
"disk.deviceReadLatency.average",
"disk.deviceWriteLatency.average",
"disk.kernelReadLatency.average",
"disk.kernelWriteLatency.average",
"disk.numberReadAveraged.average",
"disk.numberWriteAveraged.average",
"disk.read.average",
"disk.totalReadLatency.average",
"disk.totalWriteLatency.average",
"disk.write.average",
"mem.active.average",
"mem.latency.average",
"mem.state.latest",
"mem.swapin.average",
"mem.swapinRate.average",
"mem.swapout.average",
"mem.swapoutRate.average",
"mem.totalCapacity.average",
"mem.usage.average",
"mem.vmmemctl.average",
"net.bytesRx.average",
"net.bytesTx.average",
"net.droppedRx.summation",
"net.droppedTx.summation",
"net.errorsRx.summation",
"net.errorsTx.summation",
"net.usage.average",
"power.power.average",
"storageAdapter.numberReadAveraged.average",
"storageAdapter.numberWriteAveraged.average",
"storageAdapter.read.average",
"storageAdapter.write.average",
"sys.uptime.latest",
]
# host_metric_exclude = [] ## Nothing excluded by default
# host_instances = true ## true by default
#
## Clusters
cluster_metric_include = [] ## if omitted or empty, all metrics are collected
# cluster_metric_exclude = [] ## Nothing excluded by default
# cluster_instances = false ## false by default
#
## Datastores
datastore_metric_include = [] ## if omitted or empty, all metrics are collected
# datastore_metric_exclude = [] ## Nothing excluded by default
# datastore_instances = false ## false by default for Datastores only
#
## Datacenters
datacenter_metric_include = [] ## if omitted or empty, all metrics are collected
# datacenter_metric_exclude = [ "*" ] ## Datacenters are not collected by default.
# datacenter_instances = false ## false by default
#
## Plugin Settings
## separator character to use for measurement and field names (default: "_")
# separator = "_"
#
## number of objects to retrieve per query for realtime resources (vms and hosts)
## set to 64 for vCenter 5.5 and 6.0 (default: 256)
# max_query_objects = 256
#
## number of metrics to retrieve per query for non-realtime resources (clusters and datastores)
## set to 64 for vCenter 5.5 and 6.0 (default: 256)
# max_query_metrics = 256
#
## number of go routines to use for collection and discovery of objects and metrics
# collect_concurrency = 1
# discover_concurrency = 1
#
## whether or not to force discovery of new objects on initial gather call before collecting metrics
## when true for large environments this may cause errors for time elapsed while collecting metrics
## when false (default) the first collection cycle may result in no or limited metrics while objects are discovered
# force_discover_on_init = false
#
## the interval before (re)discovering objects subject to metrics collection (default: 300s)
# object_discovery_interval = "300s"
#
## timeout applies to any API request made to vCenter
# timeout = "60s"
#
## Optional SSL Config
# ssl_ca = "/path/to/cafile"
# ssl_cert = "/path/to/certfile"
# ssl_key = "/path/to/keyfile"
## Use SSL but skip chain & host verification
## NOTE(review): with verification skipped the vCenter certificate is not
## checked; consider setting ssl_ca to the vCenter CA instead -- TODO confirm
## whether this is acceptable for the target environment.
insecure_skip_verify = true
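Startup Telegraf
Note: the stock systemd unit only loads /etc/telegraf/telegraf.conf and /etc/telegraf/telegraf.d/*.conf. Make sure the configuration created above (/etc/telegraf/telegraf-vmware.conf) is copied or moved to one of those locations, otherwise the service will not pick it up.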
systemctl start telegraf
systemctl enable telegraf
Configure InfluxDB
Set the retention policy
[root@mm-mon ~]# influx -username admin -password dba4mis
Connected to http://localhost:8086 version 1.8.5
InfluxDB shell version: 1.8.5
> show retention policies on vmware
name duration shardGroupDuration replicaN default
---- -------- ------------------ -------- -------
autogen 0s 168h0m0s 1 true
> alter retention policy "autogen" on "vmware" duration 200d shard duration 1d
> show retention policies on vmware
name duration shardGroupDuration replicaN default
---- -------- ------------------ -------- -------
autogen 4800h0m0s 24h0m0s 1 true
Configure Grafana
- Add a datasource for InfluxDB
- Name: VMware
- Type: InfluxDB
- Database: vmware
- Username: <InfluxDB Credential>
- Password: <InfluxDB Credential>
- Import the dashboards
FAQ
Q: 之後新增的 VM 不會出現在 Dashboard。
A: 先確認 InfluxDB 是否已寫入新 VM 的 data,如果有,只要更新 Dashboard Settings > Variables > virtualmachine > 執行 Update,檢查 Preview of values 是否有出現新 VM name。
檢查 InfluxDB
# Check all current VM names
select DISTINCT("vmname") from (select "ready_summation","vmname" from "vsphere_vm_cpu" WHERE time > now() - 10m)
With SexiGraf
- Official: http://www.sexigraf.fr/quickstart/
- OS-based: Ubuntu 16.04.6 LTS
Download the OVA appliance
vCenter/vSphere credential for monitoring only (read-only account)
vCenter Web Client > 功能表 > 系統管理 > Single Sign On: 使用者與群組 > 新增
- 使用者名稱: winmon
- 密碼: xxxx
- 確認密碼: xxxx
vCenter Web Client > 功能表 > 主機與叢集 > 權限 > 新增權限
- 使用者: vsphere.local , 搜尋 winmon
- 角色: 唯讀
- 散佈到子系: 勾選
Deploy the OVA to vCenter/ESXi
部署到 ESXi 6.5 時失敗,錯誤訊息
Line 163: Unable to parse 'tools.syncTime' for attribute 'key' on element 'Config'.
解決方法: 使用 OVF-Tool 先解開 OVA 檔,編輯 OVF 檔的內容
# Before
<vmw:Config ovf:required="true" vmw:key="tools.syncTime" vmw:value="true"/>
# After
<vmw:Config ovf:required="false" vmw:key="tools.syncTime" vmw:value="true"/>
存檔後,重新再部署一次。
First Boot of the VM
1. SSH Credential: root / Sex!Gr@f
2. The IP address must be configured manually: edit /etc/network/interfaces.
3. Configure the hostname
hostnamectl set-hostname esx-mon
4. Configure the timezone and time server
timedatectl set-timezone Asia/Taipei
vi /etc/ntp.conf
#pool 0.ubuntu.pool.ntp.org iburst
#pool 1.ubuntu.pool.ntp.org iburst
#pool 2.ubuntu.pool.ntp.org iburst
#pool 3.ubuntu.pool.ntp.org iburst
# Use Ubuntu's ntp server as a fallback.
#pool ntp.ubuntu.com
# Added the local time server
server 192.168.21.86 prefer iburst
Restart the ntpd
systemctl stop ntp
systemctl start ntp
# Check the timeserver
ntpq -p
First Login to the Grafana Web UI
- Login: admin / Sex!Gr@f
- Add the credential to connect to the vCenter server managed: Search > SexiGraf > SexiGraf Web Admin > Credential Store
- vCenter IP: <vCenter/ESXi IP or FQDN>
- Username: <Username to login to vCenter/ESXi>
- Password: <Password to login to vCenter/ESXi>