Previously we didn't monitor this on a regular basis, but after moving a lot of data the cluster basically crashed. The pool metadata use got to 98% and, as far as our ESX hosts were concerned, the disks were all full even though there were terabytes of space available. Our servers stopped in their tracks until Tegile got the metadata cleaned up, which took only minutes to do but had us down for a few hours.
With ZFS it seems that this metadata is the equivalent of the file allocation table on NTFS or FAT. I'm no Solaris expert but these things are making me learn a lot, and fast.
To avoid this in the future I retooled my replication report to detail the pool stats. I plan to run this as a scheduled task three times a week. The script is below, perhaps it can help someone else to keep their system healthy.
Make sure to create the config file and place it in the folder with the script. You'll need PowerShell v5 and the Posh-SSH module for this to work.
A minor update... Added separate triggers for pool and meta use.
Another update... Fixed error related to leading zero on meta used.
<#==============================================================================
         File Name : ZebiDiskHealthReport.ps1
   Original Author : Kenneth C. Mazie (kcmjr @ kcmjr.com)
                   :
       Description : This script will query multiple Tegile Zebi Controllers,
                   : parse the output from "zpool list", and email a report.
                   :
         Arguments : Named commandline parameters: (all are optional)
                   : "-console" - Displays console output during run.
                   : "-debug"   - Switches email recipient.
                   : Both are declared as [bool], so pass a value (e.g. -Console $true).
                   :
             Notes : Settings are loaded from an XML file located in the script folder.
                   : See the end of the script for config file example.
                   :
      Requirements : Requires PS v5. Requires Posh-SSH module.
                   :
          Warnings : Make absolutely sure the proper user is in the config file AND on each Zebi
                   : as well as is listed in the sshd_config file on the Zebi.
                   :
             Legal : Public Domain. Modify and redistribute freely. No rights reserved.
                   : SCRIPT PROVIDED "AS IS" WITHOUT WARRANTIES OR GUARANTEES OF
                   : ANY KIND. USE AT YOUR OWN RISK. NO TECHNICAL SUPPORT PROVIDED.
                   :
           Credits : Code snippets and/or ideas came from many sources around the web.
                   :
    Last Update by : Kenneth C. Mazie (email kcmjr AT kcmjr.com for comments or to report bugs)
   Version History : v1.0 - 02-03-16 - Original
    Change History : v1.1 - 02-14-17 - Retooled from Zebi replication report script.
                   : v1.2 - 02-24-17 - Added separate triggers for pool and meta use.
                   : v1.3 - 03-02-17 - Fixed error caused by meta free less than 10 without leading zero.
                   :
#===============================================================================#>
#requires -version 5.0
Param (
    [bool]$Debug = $false,      #--[ $true sends the report to the debug recipient instead of the normal one ]--
    [bool]$Console = $false     #--[ $true echoes progress and parsed data to the console ]--
)
Clear-Host

#--[ Store all the start up variables so you can clean up when the script finishes. ]--
if ($startupvariables) { try { Remove-Variable -Name startupvariables -Scope Global -ErrorAction SilentlyContinue } catch { } }
New-Variable -Force -Name startupVariables -Value ( Get-Variable | ForEach-Object { $_.Name } )

#--[ Posh-SSH supplies New-SSHSession / Invoke-SSHCommand / Remove-SSHSession used further down ]--
If (!(Get-Module Posh-SSH)){Import-Module "Posh-SSH" -ErrorAction SilentlyContinue}
If (!(Get-Module Posh-SSH)){Write-Warning "Posh-SSH module could not be loaded. SSH queries will return no data."}

If ($Debug){$Script:Debug = $true}
If ($Console){$Script:Console = $true}
$ErrorActionPreference = "silentlycontinue"                        #--[ Non-terminating errors are suppressed from here on ]--

$Script:Datetime = Get-Date -Format "MM-dd-yyyy_HH:mm"             #--[ Current date & time ]--
$Script:Today = Get-Date -Format "MM-dd-yyyy"                      #--[ Current date ]--
$Script:ThisYear = Get-Date -Format "yyyy"                         #--[ Current year ]--
$EpochDiff = New-TimeSpan -Start '01 January 1970 00:00:00' -End $(Get-Date)   #--[ Time since 01-01-1970 (local clock) ]--
$EpochSecs = [INT] $EpochDiff.TotalSeconds                         #--[ Seconds since 01-01-1970, rounded ]--
$EpochDays = [INT] (($EpochDiff.TotalSeconds)/86400)               #--[ Converted to days ]--
$Script:Span = 11                                                  #--[ HTML col count ]--
$Script:BadMetaPercent = 85                                        #--[ Meta use percent after which report turns red ]--
$Script:BadPoolPercent = 80                                        #--[ Pool use percent after which report turns red ]--

#--[ Functions ]----------------------------------------------------------------
Function ResetVariables {
    #--[ Removes, from the global scope, every variable whose name is not in the ]--
    #--[ $startupVariables snapshot taken when the script started.              ]--

    #--[ Without a snapshot every name would look "new" and all removable globals would be wiped ]--
    If (-not $startupVariables){ Return }

    #--[ Copy the snapshot before removing anything. "startupVariables" is not part of its own ]--
    #--[ snapshot, so when it lives in the global scope (e.g. the script was dot-sourced) it   ]--
    #--[ is removed during the sweep; filtering against the live variable after that point     ]--
    #--[ would treat every remaining variable as new.                                          ]--
    $KeepNames = @($startupVariables)

    ForEach ($Variable in @(Get-Variable)){
        If ($KeepNames -notcontains $Variable.Name){
            #--[ Best effort: names that do not exist in the global scope are skipped silently ]--
            try { Remove-Variable -Name "$($Variable.Name)" -Force -Scope "global" -ErrorAction SilentlyContinue -WarningAction SilentlyContinue } catch { }
        }
    }
}
Function SendEmail {
    #--[ Emails $Script:ReportBody through $Script:SmtpServer.                                   ]--
    #--[ Reads   : $Script:EmailFrom, EmailTo, DebugEmail, EmailSubject, EmailHTML, SmtpServer,  ]--
    #--[           ReportBody, Debug, Console.                                                   ]--
    #--[ Debug   : the message goes to $Script:DebugEmail and any failure is re-thrown.          ]--
    If ($Script:Debug){$ErrorActionPreference = "stop"}

    #--[ The config value arrives as text ("$true" / "$false"). A plain cast to [bool] makes any ]--
    #--[ non-empty string true, so explicit false values are recognised here; everything else   ]--
    #--[ non-empty still counts as true.                                                        ]--
    $IsHtml = "$($Script:EmailHTML)".Trim() -notmatch '^(\$?false|0|no)?$'

    $email = New-Object System.Net.Mail.MailMessage
    $smtp = $null
    Try{
        $email.From = $Script:EmailFrom
        $email.IsBodyHtml = $IsHtml
        If ($Script:Debug){
            $email.To.Add($Script:DebugEmail)
        }Else{
            $email.To.Add($Script:EmailTo)          #--[ Comma separated list is accepted ]--
        }
        $email.Subject = $Script:EmailSubject
        $email.Body = $Script:ReportBody
        $smtp = New-Object Net.Mail.SmtpClient($Script:SmtpServer)
        $smtp.Send($email)
        #--[ Only reached when Send() did not throw ]--
        If ($Script:Console){Write-Host "`nEmail sent...`n"}
    }Catch{
        Write-Warning "Email could not be sent: $($_.Exception.Message)"
        If ($Script:Debug){Throw}
    }Finally{
        #--[ Release the SMTP connection and any message resources ]--
        If ($smtp){$smtp.Dispose()}
        If ($email){$email.Dispose()}
    }
}
#--[ Read and load configuration file ]-----------------------------------------
#--[ Configuration.xml must sit in the script folder. Note that it holds the SSH password in clear text. ]--
$ConfigFile = "$PSScriptRoot\Configuration.xml"
If (!(Test-Path $ConfigFile)){                                                   #--[ Error out if configuration file doesn't exist ]--
    Write-Host "MISSING CONFIG FILE. Script aborted." -ForegroundColor Red
    Return                                                                       #--[ Leave the script; "break" is only meant for loops/switch ]--
}Else{
    [xml]$Script:Configuration = Get-Content $ConfigFile                         #--[ Load configuration ]--
    #--[ Email settings ]--
    $Script:DebugEmail = $Script:Configuration.Settings.Email.Debug
    $Script:EmailTo = $Script:Configuration.Settings.Email.To
    $Script:EmailHTML = $Script:Configuration.Settings.Email.HTML
    $Script:EmailSubject = $Script:Configuration.Settings.Email.Subject
    $Script:EmailFrom = $Script:Configuration.Settings.Email.From
    $Script:SmtpServer = $Script:Configuration.Settings.Email.SmtpServer
    #--[ SSH credentials used on every controller ]--
    $Script:UserName = $Script:Configuration.Settings.Credentials.Username
    $Script:Password = $Script:Configuration.Settings.Credentials.Password
    #--[ Controllers to query and report title ]--
    [array]$Script:Targets = $Script:Configuration.Settings.General.Targets
    $Script:ReportName = $Script:Configuration.Settings.General.ReportName
}
#--[ Add header to html output file ]--
#--[ $Script:ReportBody is an array of HTML fragments: style sheet first, then the opening of the ]--
#--[ report table with the title and explanatory rows. The table is closed after all targets     ]--
#--[ have been processed.                                                                        ]--
$Script:ReportBody = @()
$Script:ReportBody += '
<style type="text/css">
    table.myTable { border:5px solid black;border-collapse:collapse;}
    table.myTable td { border:2px solid black;padding:5px;white-space:nowrap;}
    table.myTable tr { border:2px solid black;padding:5px;white-space:nowrap;}
    table.myTable th { border:2px solid black;padding:5px;background:#949494;white-space:nowrap;}
    table.bottomBorder { border-collapse:collapse; }
    table.bottomBorder td, table.bottomBorder th { border-bottom:1px dotted black;padding:5px; }
    tr.noBorder td {border:0}
    td.auto { border:2px solid black;padding:5px;white-space:nowrap;}
</style>'

#--[ The thresholds quoted in the text come from the same variables that drive the cell colouring ]--
$Script:ReportBody +=
'<table class="myTable">
<tr class="noBorder"><td colspan=' + $Script:Span + '><center><h1>- ' + $Script:ReportName + ' -</h1></center></td></tr>
<tr class="noBorder"><td colspan=' + $Script:Span + '><center>The following report displays statistics for the disk pools on all Tegile SAN controllers.</center></td></tr>
<tr class="noBorder"><td colspan=' + $Script:Span + '><center>Metadata is only used for pools that host data, not system pools. Not all controllers host data.</center></td></tr>
<tr class="noBorder"><td colspan=' + $Script:Span + '><center>If Metadata usage exceeds ' + $Script:BadMetaPercent + '% manual cleanup is recommended. Above 95% writes will begin to fail.</center></td></tr>
<tr class="noBorder"><td colspan=' + $Script:Span + '><center>If Pool data usage exceeds ' + $Script:BadPoolPercent + '% cell color will change to indicate manual inspection is recommended.</center></td></tr>
<tr class="noBorder"><td colspan=' + $Script:Span + '></td></tr>
'
#--[ Query every controller with "zpool list" and add one table section per controller ]--

Function GetPercentValue ($Text){
    #--[ Returns the numeric part of a "NN%" field, or -1 when the field is not numeric (e.g. "-"). ]--
    #--[ Thresholds must be compared as numbers: as strings "9" sorts above "80" and "100" below it. ]--
    $Number = 0.0
    $Clean = ("$Text" -replace "%", "").Trim()
    If ([double]::TryParse($Clean, [System.Globalization.NumberStyles]::Float, [System.Globalization.CultureInfo]::InvariantCulture, [ref]$Number)){
        Return $Number
    }
    Return -1
}

Function NewReportCell ($Text, $Background, $Foreground){
    #--[ Builds one table cell with the given background and font color ]--
    Return '<td class="myTable" bgcolor=' + $Background + '><font color=' + $Foreground + '>' + $Text + '</font></td>'
}

#--[ HTML Row Color Settings ]------------------------------------------
$ColorGrey = "#dfdfdf"          #--[ Grey default cell background ]--
$ColorRed = "#ff0000"           #--[ Red foreground for alerts ]--
$ColorOra = "#ff9900"           #--[ Orange background for alerts ]--
$ColorYel = "#ffd900"           #--[ Yellow background for alerts ]--
$ColorBla = "#000000"           #--[ Black default cell foreground ]--

$Cmd = '/usr/sbin/zpool list'   #--[ If you change this command keep the path or SSH will not record data ]--
$SecPassword = ConvertTo-SecureString $Script:Password -AsPlainText -Force
$Creds = New-Object System.Management.Automation.PSCredential ($Script:UserName, $SecPassword)

Foreach ($Target in $Script:Targets.Target){
    If ($Console){Write-Host "`n--[ Processing Target: $Target ]-----------------------------------" -ForegroundColor Yellow }

    Remove-SSHSession -SessionId 0 -ErrorAction SilentlyContinue | Out-Null             #--[ Clear out previous session if it still exists ]--
    $SSH = New-SSHSession -ComputerName $Target -Credential $Creds -AcceptKey:$true    #--[ Open new SSH session ]--
    $Script:Return = $null
    If ($SSH){
        $Script:Return = $(Invoke-SSHCommand -SSHSession $SSH -Command $Cmd).Output     #--[ Invoke SSH command and capture the output lines ]--
        Remove-SSHSession -SSHSession $SSH -ErrorAction SilentlyContinue | Out-Null     #--[ Close the session that was just opened ]--
    }
    #If ($Script:Console){$Script:Return}                                               #--[ Display raw output ]--

    $LineCount = 0
    $color = 10                 #--[ Console color index used when echoing lines; cycles 10..15 ]--

    #--[ Build target table ]--
    $Script:HTMLData = @()
    $Script:HTMLData += '<tr class="myTable"><th>Controller</th><th>Pool</th><th>Pool Size</th><th>Pool Used</th><th>Pool Free</th><th>Pool % Used</th><th>Dedup %</th><th>Meta Size</th><th>Meta Used</th><th>Meta Free</th><th>Meta % Used</th></tr>'
    $Script:RowData = ""

    If (-not $Script:Return){
        #--[ Make an unreachable or silent controller visible instead of leaving an empty section ]--
        $Script:RowData += '<tr><td class="myTable" colspan=' + $Script:Span + ' bgcolor=' + $ColorYel + '><font color=' + $ColorRed + '>' + $Target + ' - no data returned by "zpool list"</font></td></tr>'
    }

    Foreach ($Line in $Script:Return){
        If ($Console){Write-Host $Line -ForegroundColor $color }                        #--[ Display parsed data for debugging ]--
        If (($LineCount -ge 1) -and ("$Line".Trim() -ne "")){                           #--[ Ignore output header and blank lines ]--
            #--[ Columns are whitespace separated. Indexes used: 0 pool, 1 size, 2 used, 3 free, ]--
            #--[ 4 pool % used, 5 dedup, 8 meta size, 9 meta used, 10 meta free, 11 meta % used. ]--
            $Fields = ("$Line".Trim() -replace "\s+", ",").Split(",")
            $PoolPercent = GetPercentValue -Text $Fields[4]
            $MetaPercent = GetPercentValue -Text $Fields[11]

            $Script:RowData += '<tr>'                                                   #--[ Start table row ]--
            $Script:RowData += (NewReportCell $Target $ColorGrey $ColorBla)             #--[ Host ]--
            $Script:RowData += (NewReportCell $Fields[0] $ColorGrey $ColorBla)          #--[ Pool ]--
            $Script:RowData += (NewReportCell $Fields[1] $ColorGrey $ColorBla)          #--[ Size ]--
            $Script:RowData += (NewReportCell $Fields[2] $ColorGrey $ColorBla)          #--[ Used ]--
            $Script:RowData += (NewReportCell $Fields[3] $ColorGrey $ColorBla)          #--[ Free ]--

            If ($Console -and $Debug){Write-Host 'Pool % Used: ' $PoolPercent}
            If ($PoolPercent -ge $Script:BadPoolPercent){
                $Script:RowData += (NewReportCell $Fields[4] $ColorYel $ColorRed)       #--[ Pool % Used BAD ]--
            }Else{
                $Script:RowData += (NewReportCell $Fields[4] $ColorGrey $ColorBla)      #--[ Pool % Used ]--
            }

            $Script:RowData += (NewReportCell $Fields[5] $ColorGrey $ColorBla)          #--[ Dedup ]--
            $Script:RowData += (NewReportCell $Fields[8] $ColorGrey $ColorBla)          #--[ Meta Size ]--
            $Script:RowData += (NewReportCell $Fields[9] $ColorGrey $ColorBla)          #--[ Meta Used ]--
            $Script:RowData += (NewReportCell $Fields[10] $ColorGrey $ColorBla)         #--[ Meta Free ]--

            If ($MetaPercent -ge $Script:BadMetaPercent){
                $Script:RowData += (NewReportCell $Fields[11] $ColorYel $ColorRed)      #--[ Meta % Used BAD ]--
                If ($Console -and $Debug){Write-Host "Meta % Used: " $MetaPercent -ForegroundColor Red }
            }Else{
                $Script:RowData += (NewReportCell $Fields[11] $ColorGrey $ColorBla)     #--[ Meta % Used ]--
                If ($Console -and $Debug){Write-Host "Meta % Used: " $MetaPercent}
            }
            $Script:RowData += '</tr>'                                                  #--[ End table row (both branches) ]--
        }
        $LineCount++
        $color++
        If ($color -gt 15){$color = 10}                                                 #--[ Console colors only go up to 15 ]--
    }

    $Script:HTMLData += $Script:RowData
    $Script:ReportBody += $Script:HTMLData
    $Script:ReportBody += '<tr class="noBorder"><td colspan=' + $Script:Span + '></td></tr>'   #--[ Spacer row between controllers ]--
}
#--[ Close the report, save a copy to disk, email it and clean up ]--
$Script:ReportBody += '</table><br><br>'
$Script:ReportBody += "<br>Script executed on "+$Datetime+"<br><br>"

#--[ $Script:FullFileName is not assigned anywhere else in the script; without a value the report ]--
#--[ would be written to a file named just ".html" in the current directory. Default to a dated   ]--
#--[ file next to the script.                                                                     ]--
If (-not $Script:FullFileName){
    $Script:FullFileName = Join-Path $PSScriptRoot ("ZebiDiskHealthReport_" + $Script:Today)
}
$Script:ReportBody | Out-File "$Script:FullFileName.html"

SendEmail

[gc]::Collect()
[gc]::WaitForPendingFinalizers()
ResetVariables
if ($Console){Write-Host "--- Completed ---" -ForegroundColor Red }
<#-----------------------------[ Config File ]---------------------------------
The configuration file must be named "Configuration.xml" and must reside in
the same folder as the script. Below is the format and element list:

<!-- Settings & Configuration File -->
<Settings>
    <General>
        <ReportName>Zebi Disk Pool Health Report</ReportName>
        <ScriptName>ReplicationReport</ScriptName>
        <Targets>
            <Target>10.100.1.1</Target>
            <Target>10.100.1.2</Target>
            <Target>10.100.1.3</Target>
            <Target>10.100.1.4</Target>
            <Target>10.100.1.5</Target>
            <Target>10.100.1.6</Target>
            <Target>10.100.1.7</Target>
            <Target>10.100.1.8</Target>
        </Targets>
    </General>
    <Email>
        <From>WeeklyReports@mydomain.com</From>
        <To>me@mydomain.com,you@yourdomain.com</To>
        <Subject>Zebi Disk Pool Status Report</Subject>
        <HTML>$true</HTML>
        <SmtpServer>10.100.1.10</SmtpServer>
    </Email>
    <Credentials>
        <UserName>zebiadminuser</UserName>
        <Password>zebiadminpwd</Password>
    </Credentials>
</Settings>
#>