-
Notifications
You must be signed in to change notification settings - Fork 57
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(analytics): GPU Usage #580
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,163 @@ | ||
import { cacheKeys, cacheResponse } from "@src/caching/helpers"; | ||
import { chainDb } from "@src/db/dbConnection"; | ||
import { QueryTypes } from "sequelize"; | ||
|
||
type GpuUtilizationData = { | ||
date: Date; | ||
cpuUtilization: number; | ||
cpu: number; | ||
gpuUtilization: number; | ||
gpu: number; | ||
count: number; | ||
node_count: number; | ||
}; | ||
|
||
type GpuBreakdownData = { | ||
date: Date; | ||
vendor: string; | ||
model: string; | ||
providerCount: number; | ||
nodeCount: number; | ||
totalGpus: number; | ||
leasedGpus: number; | ||
gpuUtilization: number; | ||
}; | ||
|
||
export async function getGpuUtilization() { | ||
return await cacheResponse( | ||
60 * 5, // 5 minutes | ||
cacheKeys.getGpuUtilization, | ||
async () => { | ||
const result = await chainDb.query<GpuUtilizationData>( | ||
`SELECT | ||
d."date", | ||
ROUND( | ||
COALESCE((SUM("activeCPU") + SUM("pendingCPU")) * 100.0 / | ||
NULLIF(SUM("activeCPU") + SUM("pendingCPU") + SUM("availableCPU"), 0), 0), | ||
2 | ||
)::float AS "cpuUtilization", | ||
COALESCE(SUM("activeCPU") + SUM("pendingCPU") + SUM("availableCPU"), 0)::integer AS "cpu", | ||
ROUND( | ||
COALESCE((SUM("activeGPU") + SUM("pendingGPU")) * 100.0 / | ||
NULLIF(SUM("activeGPU") + SUM("pendingGPU") + SUM("availableGPU"), 0), 0), | ||
2 | ||
)::float AS "gpuUtilization", | ||
COALESCE(SUM("activeGPU") + SUM("pendingGPU") + SUM("availableGPU"), 0)::integer AS "gpu", | ||
COUNT(*) as provider_count, | ||
COALESCE(COUNT(DISTINCT "nodeId"), 0) as node_count | ||
FROM "day" d | ||
INNER JOIN ( | ||
SELECT DISTINCT ON("hostUri",DATE("checkDate")) | ||
DATE("checkDate") AS date, | ||
ps."activeCPU", ps."pendingCPU", ps."availableCPU", | ||
ps."activeGPU", ps."pendingGPU", ps."availableGPU", | ||
ps."isOnline", | ||
n.id as "nodeId" | ||
FROM "providerSnapshot" ps | ||
INNER JOIN "provider" ON "provider"."owner"=ps."owner" | ||
INNER JOIN "providerSnapshotNode" n ON n."snapshotId"=ps.id AND n."gpuAllocatable" > 0 | ||
LEFT JOIN "providerSnapshotNodeGPU" gpu ON gpu."snapshotNodeId" = n.id | ||
WHERE ps."isLastSuccessOfDay" = TRUE | ||
ORDER BY "hostUri",DATE("checkDate"),"checkDate" DESC | ||
) "dailyProviderStats" | ||
ON DATE(d."date")="dailyProviderStats"."date" | ||
GROUP BY d."date" | ||
ORDER BY d."date" ASC`, | ||
{ | ||
type: QueryTypes.SELECT | ||
} | ||
); | ||
|
||
const stats = result.map(day => ({ | ||
date: day.date, | ||
value: day.gpuUtilization | ||
})); | ||
|
||
return { | ||
currentValue: stats[stats.length - 1]?.value ?? 0, | ||
compareValue: stats[stats.length - 2]?.value ?? 0, | ||
snapshots: stats | ||
}; | ||
}, | ||
true | ||
); | ||
} | ||
|
||
export async function getGpuBreakdownByVendorAndModel(): Promise<GpuBreakdownData[]> { | ||
return await cacheResponse( | ||
60 * 5, // 5 minutes | ||
cacheKeys.getGpuBreakdown, | ||
async () => { | ||
const result = await chainDb.query<{ | ||
date: Date; | ||
vendor: string; | ||
model: string; | ||
provider_count: number; | ||
node_count: number; | ||
total_gpus: number; | ||
leased_gpus: number; | ||
gpuUtilization: number; | ||
}>( | ||
`SELECT | ||
d."date", | ||
COALESCE(gpu."vendor", 'Unknown') as "vendor", | ||
COALESCE(gpu."name", 'Unknown') as "model", | ||
COALESCE(COUNT(DISTINCT "dailyProviderStats"."hostUri"), 0) as provider_count, | ||
COALESCE(COUNT(DISTINCT n.id), 0) as node_count, | ||
COALESCE(COUNT(gpu.id), 0) as total_gpus, | ||
COALESCE(CAST(ROUND(SUM( | ||
CAST(n."gpuAllocated" as float) / | ||
NULLIF((SELECT COUNT(*) | ||
FROM "providerSnapshotNodeGPU" subgpu | ||
WHERE subgpu."snapshotNodeId" = n.id), 0) | ||
)) as int), 0) as leased_gpus, | ||
CAST(COALESCE( | ||
SUM( | ||
CAST(n."gpuAllocated" as float) / | ||
NULLIF((SELECT COUNT(*) | ||
FROM "providerSnapshotNodeGPU" subgpu | ||
WHERE subgpu."snapshotNodeId" = n.id), 0) | ||
) * 100.0 / NULLIF(COUNT(gpu.id), 0) | ||
, 0) as numeric(10,2)) as "gpuUtilization" | ||
FROM "day" d | ||
INNER JOIN ( | ||
SELECT DISTINCT ON("hostUri", DATE("checkDate")) | ||
ps.id as "snapshotId", | ||
"hostUri", | ||
DATE("checkDate") AS date, | ||
ps."isOnline" | ||
FROM "providerSnapshot" ps | ||
INNER JOIN "provider" ON "provider"."owner" = ps."owner" | ||
WHERE ps."isLastSuccessOfDay" = TRUE | ||
ORDER BY "hostUri", DATE("checkDate"), "checkDate" DESC | ||
) "dailyProviderStats" ON DATE(d."date") = "dailyProviderStats"."date" | ||
INNER JOIN "providerSnapshotNode" n ON n."snapshotId" = "dailyProviderStats"."snapshotId" AND n."gpuAllocatable" > 0 | ||
LEFT JOIN "providerSnapshotNodeGPU" gpu ON gpu."snapshotNodeId" = n.id | ||
GROUP BY d."date", gpu."vendor", gpu."name" | ||
ORDER BY d."date" ASC, gpu."vendor", gpu."name"`, | ||
{ | ||
type: QueryTypes.SELECT | ||
} | ||
); | ||
|
||
return result.map(row => ({ | ||
date: row.date, | ||
vendor: row.vendor, | ||
model: row.model, | ||
providerCount: row.provider_count, | ||
nodeCount: row.node_count, | ||
totalGpus: row.total_gpus, | ||
leasedGpus: row.leased_gpus, | ||
gpuUtilization: row.gpuUtilization | ||
})); | ||
}, | ||
true | ||
); | ||
} | ||
|
||
export async function getLatestGpuBreakdown(): Promise<GpuBreakdownData[]> { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we expose this method as a documented route please? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On it - okay to take as a follow-up PR @baktun14? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can add it to this pr, you just need to add a route. |
||
const allData = await getGpuBreakdownByVendorAndModel(); | ||
const latestDate = allData.reduce((latest, current) => (latest > current.date ? latest : current.date), new Date(0)); | ||
|
||
return allData.filter(data => data.date.getTime() === latestDate.getTime()); | ||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
there are relevant constants in
date-fns
, the comment wouldn't be needed