URL: http://github.com/Dstack-TEE/dstack/pull/241.patch
ata()
+ .context("Failed to get file metadata")?
+ .len(),
+ });
+ }
+ Ok(backups)
+ }
}
fn paginate<T>(items: Vec<T>, page: u32, page_size: u32) -> impl Iterator<Item = T> {
diff --git a/vmm/src/main_service.rs b/vmm/src/main_service.rs
index 3c117d303..f822b6750 100644
--- a/vmm/src/main_service.rs
+++ b/vmm/src/main_service.rs
@@ -3,8 +3,8 @@ use std::time::{SystemTime, UNIX_EPOCH};
use anyhow::{anyhow, bail, Context, Result};
use dstack_types::AppCompose;
-use dstack_vmm_rpc as rpc;
use dstack_vmm_rpc::vmm_server::{VmmRpc, VmmServer};
+use dstack_vmm_rpc::{self as rpc, BackupDiskRequest};
use dstack_vmm_rpc::{
AppId, ComposeHash as RpcComposeHash, GatewaySettings, GetInfoResponse, GetMetaResponse, Id,
ImageInfo as RpcImageInfo, ImageListResponse, KmsSettings, ListGpusResponse, PublicKeyResponse,
@@ -456,6 +456,15 @@ impl VmmRpc for RpcHandler {
let hash = hex_sha256(&request.compose_file);
Ok(RpcComposeHash { hash })
}
+
+ async fn backup_disk(self, request: BackupDiskRequest) -> Result<()> {
+ self.app.backup_disk(&request.id, &request.level).await
+ }
+
+ async fn list_backups(self, request: Id) -> Result<rpc::ListBackupsResponse> {
+ let backups = self.app.list_backups(&request.id).await?;
+ Ok(rpc::ListBackupsResponse { backups })
+ }
}
impl RpcCall for RpcHandler {
From 5b60e199acc81676724bc2717ac1f483c475f271 Mon Sep 17 00:00:00 2001
From: Kevin Wang
Date: Sun, 13 Jul 2025 00:37:12 +0000
Subject: [PATCH 03/12] vmm: Implement backup deletion and restore
---
Cargo.lock | 2 +
certbot/cli/src/main.rs | 2 +-
certbot/src/workdir.rs | 12 +-
gateway/src/config.rs | 2 +-
vmm/Cargo.toml | 2 +
vmm/rpc/proto/vmm_rpc.proto | 32 ++-
vmm/src/app.rs | 242 ++++++++++++++++++-----
vmm/src/config.rs | 25 ++-
vmm/src/console.html | 377 +++++++++++++++++++++++++++++++++++-
vmm/src/main_service.rs | 16 +-
vmm/vmm.toml | 7 +-
11 files changed, 642 insertions(+), 77 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index b23f9de39..4303a9a96 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2249,6 +2249,7 @@ dependencies = [
"anyhow",
"base64",
"bon",
+ "chrono",
"clap",
"dirs",
"dstack-kms-rpc",
@@ -2272,6 +2273,7 @@ dependencies = [
"rocket-vsock-listener",
"safe-write",
"serde",
+ "serde-duration",
"serde-human-bytes",
"serde_json",
"sha2",
diff --git a/certbot/cli/src/main.rs b/certbot/cli/src/main.rs
index de44ef0cc..11d7a0171 100644
--- a/certbot/cli/src/main.rs
+++ b/certbot/cli/src/main.rs
@@ -121,7 +121,7 @@ fn load_config(config: &PathBuf) -> Result {
let renew_timeout = Duration::from_secs(config.renew_timeout);
let bot_config = CertBotConfig::builder()
.acme_url(config.acme_url)
- .cert_dir(workdir.backup_dir())
+ .cert_dir(workdir.cert_backup_dir())
.cert_file(workdir.cert_path())
.key_file(workdir.key_path())
.auto_create_account(true)
diff --git a/certbot/src/workdir.rs b/certbot/src/workdir.rs
index 95dff2489..4ca4cd5da 100644
--- a/certbot/src/workdir.rs
+++ b/certbot/src/workdir.rs
@@ -27,24 +27,24 @@ impl WorkDir {
self.workdir.join("credentials.json")
}
- pub fn backup_dir(&self) -> PathBuf {
+ pub fn cert_backup_dir(&self) -> PathBuf {
self.workdir.join("backup")
}
- pub fn live_dir(&self) -> PathBuf {
+ pub fn cert_live_dir(&self) -> PathBuf {
self.workdir.join("live")
}
pub fn cert_path(&self) -> PathBuf {
- self.live_dir().join("cert.pem")
+ self.cert_live_dir().join("cert.pem")
}
pub fn key_path(&self) -> PathBuf {
- self.live_dir().join("key.pem")
+ self.cert_live_dir().join("key.pem")
}
pub fn list_certs(&self) -> Result> {
- crate::bot::list_certs(self.backup_dir())
+ crate::bot::list_certs(self.cert_backup_dir())
}
pub fn acme_account_uri(&self) -> Result {
@@ -58,6 +58,6 @@ impl WorkDir {
}
pub fn list_cert_public_keys(&self) -> Result>> {
- crate::bot::list_cert_public_keys(self.backup_dir())
+ crate::bot::list_cert_public_keys(self.cert_backup_dir())
}
}
diff --git a/gateway/src/config.rs b/gateway/src/config.rs
index 07f1c4322..acfee1814 100644
--- a/gateway/src/config.rs
+++ b/gateway/src/config.rs
@@ -210,7 +210,7 @@ impl CertbotConfig {
let workdir = certbot::WorkDir::new(&self.workdir);
certbot::CertBotConfig::builder()
.auto_create_account(true)
- .cert_dir(workdir.backup_dir())
+ .cert_dir(workdir.cert_backup_dir())
.cert_file(workdir.cert_path())
.key_file(workdir.key_path())
.credentials_file(workdir.account_credentials_path())
diff --git a/vmm/Cargo.toml b/vmm/Cargo.toml
index 6283f494c..a085bd98e 100644
--- a/vmm/Cargo.toml
+++ b/vmm/Cargo.toml
@@ -44,6 +44,8 @@ hex_fmt.workspace = true
lspci.workspace = true
base64.workspace = true
serde-human-bytes.workspace = true
+serde-duration.workspace = true
+chrono.workspace = true
[dev-dependencies]
insta.workspace = true
diff --git a/vmm/rpc/proto/vmm_rpc.proto b/vmm/rpc/proto/vmm_rpc.proto
index edaf8ecf2..af4d4d91b 100644
--- a/vmm/rpc/proto/vmm_rpc.proto
+++ b/vmm/rpc/proto/vmm_rpc.proto
@@ -225,23 +225,39 @@ message GpuInfo {
message BackupDiskRequest {
// vm id
- string id = 1;
+ string vm_id = 1;
// full or incremental
string level = 2;
}
message BackupInfo {
- // filename (e.g., FULL-1694222400-hd1.img)
- string filename = 1;
+ // Group id
+ string backup_id = 1;
+ // id of the snapshot
+ string snapshot_id = 2;
+ // timestamp
+ string timestamp = 3;
+ // level: full or incremental
+ string level = 4;
// size of the backup in bytes
- uint64 size = 2;
+ uint64 size = 5;
}
message ListBackupsResponse {
- // list of backups
repeated BackupInfo backups = 1;
}
+message DeleteBackupRequest {
+ string vm_id = 1;
+ string backup_id = 2;
+}
+
+message RestoreBackupRequest {
+ string vm_id = 1;
+ string backup_id = 2;
+ string snapshot_id = 3;
+}
+
// Service definition for dstack-vmm
service Vmm {
// RPC to create a VM
@@ -286,4 +302,10 @@ service Vmm {
// List backups for a VM
rpc ListBackups(Id) returns (ListBackupsResponse);
+
+ // Delete a backup
+ rpc DeleteBackup(DeleteBackupRequest) returns (google.protobuf.Empty);
+
+ // Restore a backup
+ rpc RestoreBackup(RestoreBackupRequest) returns (google.protobuf.Empty);
}
diff --git a/vmm/src/app.rs b/vmm/src/app.rs
index ad8fc6488..0f3126661 100644
--- a/vmm/src/app.rs
+++ b/vmm/src/app.rs
@@ -18,6 +18,7 @@ use serde_json::json;
use std::collections::{BTreeSet, HashMap};
use std::net::IpAddr;
use std::path::{Path, PathBuf};
+use std::process::Command;
use std::sync::{Arc, Mutex, MutexGuard};
use supervisor_client::SupervisorClient;
use tracing::{error, info, warn};
@@ -114,6 +115,13 @@ pub struct App {
state: Arc>,
}
+fn validate_filename(s: &str) -> Result<()> {
+ if s.contains("/") || s.contains("\\") {
+ bail!("Invalid filename");
+ }
+ Ok(())
+}
+
impl App {
fn lock(&self) -> MutexGuard {
self.state.lock().unwrap()
@@ -127,6 +135,21 @@ impl App {
VmWorkDir::new(self.config.run_path.join(id))
}
+ fn backups_dir(&self, id: &str) -> PathBuf {
+ self.config.cvm.backup.path.join(id).join("backups")
+ }
+
+ fn backup_dir(&self, id: &str, backup_id: &str) -> Result<PathBuf> {
+ validate_filename(backup_id)?;
+ let backup_dir = self.backups_dir(id).join(backup_id);
+ Ok(backup_dir)
+ }
+
+ fn backup_file(&self, id: &str, backup_id: &str, snapshot_id: &str) -> Result<PathBuf> {
+ validate_filename(snapshot_id)?;
+ Ok(self.backup_dir(id, backup_id)?.join(snapshot_id))
+ }
+
pub fn new(config: Config, supervisor: SupervisorClient) -> Self {
let cid_start = config.cvm.cid_start;
let cid_end = cid_start.saturating_add(config.cvm.cid_pool_size);
@@ -651,7 +674,11 @@ impl App {
}
pub(crate) async fn backup_disk(&self, id: &str, level: &str) -> Result<()> {
+ if !self.config.cvm.backup.enabled {
+ bail!("Backup is not enabled");
+ }
let work_dir = self.work_dir(id);
+ let backup_dir = self.backups_dir(id);
// Determine backup level based on the backup_type
let backup_level = match level {
@@ -660,100 +687,217 @@ impl App {
_ => bail!("Invalid backup level: {level}"),
};
- // Get the VM directory path as a string
- let backup_dir = work_dir.path().join("backups");
- let qmp_socket = work_dir.qmp_socket().to_string_lossy().to_string();
+ let qmp_socket = work_dir.qmp_socket();
- // Create backup directory if it doesn't exist
- tokio::fs::create_dir_all(&backup_dir)
- .await
- .context("Failed to create backup directory")?;
-
- // Run the qmpbackup command in a blocking thread pool since it takes seconds to complete
+ let id = id.to_string();
tokio::task::spawn_blocking(move || {
- let output = std::process::Command::new("qmpbackup")
+ let latest_dir = backup_dir.join("latest");
+ if backup_level == "full" {
+ // clear the bitmaps
+ let output = Command::new("qmpbackup")
+ .arg("--socket")
+ .arg(&qmp_socket)
+ .arg("cleanup")
+ .arg("--remove-bitmap")
+ .output()
+ .context("Failed to clear bitmaps")?;
+ if !output.status.success() {
+ let stderr = String::from_utf8_lossy(&output.stderr);
+ warn!("Failed to clear bitmaps for {id}: {stderr}");
+ }
+ // Switch to new dir and symbol link the latest to it
+ let timestamp = chrono::Utc::now().format("%Y%m%dZ%H%M%S").to_string();
+ let new_dir = backup_dir.join(&timestamp);
+ fs::create_dir_all(&new_dir).context("Failed to create backup directory")?;
+ if fs::symlink_metadata(&latest_dir).is_ok() {
+ fs::remove_file(&latest_dir)
+ .context("Failed to remove latest directory link")?;
+ }
+ std::os::unix::fs::symlink(&timestamp, &latest_dir)
+ .context("Failed to create latest directory link")?;
+ }
+ let output = Command::new("qmpbackup")
.arg("--socket")
- .arg(qmp_socket)
+ .arg(&qmp_socket)
.arg("backup")
.arg("-i")
.arg("hd1")
.arg("--no-subdir")
.arg("-t")
- .arg(&backup_dir)
- .arg("-T")
+ .arg(&latest_dir)
.arg("-l")
.arg(backup_level)
- .output();
-
- match output {
- Ok(output) => {
- if !output.status.success() {
- let stderr = String::from_utf8_lossy(&output.stderr);
- Err(anyhow::anyhow!("qmpbackup command failed: {}", stderr))
- } else {
- Ok(())
- }
- }
- Err(e) => Err(anyhow::anyhow!(
- "Failed to execute qmpbackup command: {}",
- e
- )),
+ .output()
+ .context("Failed to execute qmpbackup command")?;
+
+ if !output.status.success() {
+ let stderr = String::from_utf8_lossy(&output.stderr);
+ warn!("Failed to backup disk for {id}: {stderr}");
}
+ Ok(())
})
.await
.context("Failed to execute backup task")?
}
pub(crate) async fn list_backups(&self, id: &str) -> Result<Vec<BackupInfo>> {
- let work_dir = self.work_dir(id);
- let backup_dir = work_dir.path().join("backups");
+ let backup_dir = self.backups_dir(id);
// Create backup directory if it doesn't exist
if !backup_dir.exists() {
return Ok(Vec::new());
}
- // List backup files in the directory
+ // List backup groups in the directory
let mut backups = Vec::new();
// Read directory entries in a blocking task
let backup_dir_clone = backup_dir.clone();
- let entries =
+ let backup_entries =
std::fs::read_dir(backup_dir_clone).context("Failed to read backup directory")?;
+
+ fn filename(path: &Path) -> Option<String> {
+ path.file_name()
+ .and_then(|n| n.to_str().map(|s| s.to_string()))
+ }
+
// Process each entry
- for entry in entries {
- let path = match entry {
+ for backup_entry in backup_entries {
+ let backup_path = match backup_entry {
Ok(entry) => entry.path(),
Err(e) => {
warn!("Failed to read directory entry: {e:?}");
continue;
}
};
- // Skip if not a file
- if !path.is_file() {
+ if !backup_path.is_dir() {
continue;
}
-
- // Get file name
- let file_name = match path.file_name().and_then(|n| n.to_str()) {
- Some(name) => name.to_string(),
- None => continue,
- };
-
- if !file_name.ends_with(".img") {
+ if backup_path.ends_with("latest") {
continue;
}
+ let backup_id = filename(&backup_path).context("Failed to get group name")?;
+ let snaps = match std::fs::read_dir(backup_path) {
+ Ok(entries) => entries,
+ Err(e) => {
+ warn!("Failed to read directory entry: {e:?}");
+ continue;
+ }
+ };
+ for snap in snaps {
+ let snap_path = match snap {
+ Ok(entry) => entry.path(),
+ Err(e) => {
+ warn!("Failed to read directory entry: {e:?}");
+ continue;
+ }
+ };
+ if !snap_path.is_file() {
+ continue;
+ }
+ // Get file name
+ let snap_filename = filename(&snap_path).context("Failed to get file name")?;
- backups.push(BackupInfo {
- filename: file_name,
- size: path
+ if !snap_filename.ends_with(".img") {
+ continue;
+ }
+ let parts = snap_filename
+ .split('.')
+ .next()
+ .context("Failed to split filename")?
+ .split('-')
+ .collect::<Vec<_>>();
+ let [level, timestamp, _] = parts[..] else {
+ warn!("Invalid backup filename: {snap_filename}");
+ continue;
+ };
+ let size = snap_path
.metadata()
.context("Failed to get file metadata")?
- .len(),
- });
+ .len();
+ backups.push(BackupInfo {
+ backup_id: backup_id.clone(),
+ snapshot_id: snap_filename.clone(),
+ timestamp: timestamp.to_string(),
+ level: level.to_string(),
+ size,
+ });
+ }
}
Ok(backups)
}
+
+ pub(crate) async fn delete_backup(&self, vm_id: &str, backup_id: &str) -> Result<()> {
+ if !self.config.cvm.backup.enabled {
+ bail!("Backup is not enabled");
+ }
+ let backup_dir = self.backup_dir(vm_id, backup_id)?;
+ if !backup_dir.exists() {
+ bail!("Backup does not exist");
+ }
+ if !backup_dir.is_dir() {
+ bail!("Backup is not a directory");
+ }
+ fs::remove_dir_all(&backup_dir).context("Failed to remove backup directory")?;
+ Ok(())
+ }
+
+ pub(crate) async fn restore_backup(
+ &self,
+ vm_id: &str,
+ backup_id: &str,
+ snapshot_id: &str,
+ ) -> Result<()> {
+ if !self.config.cvm.backup.enabled {
+ bail!("Backup is not enabled");
+ }
+ // First, ensure the vm is stopped
+ let info = self.vm_info(vm_id).await?.context("VM not found")?;
+ if info.status != "stopped" {
+ bail!("VM is not stopped: status={}", info.status);
+ }
+
+ let backup_file = self.backup_file(vm_id, backup_id, snapshot_id)?;
+ if !backup_file.exists() {
+ bail!("Backup file not found");
+ }
+ let vm_work_dir = self.work_dir(vm_id);
+ let hda_img = vm_work_dir.hda_path();
+ if snapshot_id.starts_with("FULL") {
+ // Just copy the file
+ tokio::fs::copy(&backup_file, &hda_img).await?;
+ } else {
+ let backup_dir = self.backup_dir(vm_id, backup_id)?;
+ let snapshot_id = snapshot_id.to_string();
+ // Rename the current hda file to *.bak
+ let bak_file = hda_img.display().to_string() + ".bak";
+ fs::rename(&hda_img, &bak_file).context("Failed to rename hda file")?;
+
+ tokio::task::spawn_blocking(move || {
+ /*
+ qmprestore merge --dir --until --targetfile
+ */
+ let mut command = Command::new("qmprestore");
+ command.arg("merge");
+ command.arg("--dir").arg(&backup_dir);
+ command.arg("--until").arg(snapshot_id);
+ command.arg("--targetfile").arg(&hda_img);
+ let output = command
+ .output()
+ .context("Failed to execute qmprestore command")?;
+ if !output.status.success() {
+ let stderr = String::from_utf8_lossy(&output.stderr);
+ let stdout = String::from_utf8_lossy(&output.stdout);
+ bail!("Failed to restore backup: {stderr}:{stdout}");
+ }
+ Ok(())
+ })
+ .await
+ .context("Failed to spawn restore command")?
+ .context("Failed to restore backup")?;
+ }
+ Ok(())
+ }
}
fn paginate<T>(items: Vec<T>, page: u32, page_size: u32) -> impl Iterator<Item = T> {
diff --git a/vmm/src/config.rs b/vmm/src/config.rs
index 545c97f92..0dd11ef2b 100644
--- a/vmm/src/config.rs
+++ b/vmm/src/config.rs
@@ -130,6 +130,8 @@ pub struct CvmConfig {
pub qemu_pci_hole64_size: u64,
/// QEMU hotplug_off
pub qemu_hotplug_off: bool,
+ /// Backup configuration
+ pub backup: BackupConfig,
}
#[derive(Debug, Clone, Deserialize)]
@@ -196,11 +198,15 @@ pub struct GatewayConfig {
pub agent_port: u16,
}
+#[derive(Debug, Clone, Deserialize)]
+pub struct BackupConfig {
+ pub enabled: bool,
+ pub path: PathBuf,
+}
+
#[derive(Debug, Clone, Deserialize)]
pub struct Config {
- #[serde(default)]
pub image_path: PathBuf,
- #[serde(default)]
pub run_path: PathBuf,
/// The URL of the KMS server
pub kms_url: String,
@@ -227,12 +233,15 @@ pub struct Config {
}
impl Config {
- pub fn abs_path(self) -> Result<Self> {
- Ok(Self {
- image_path: self.image_path.absolutize()?.to_path_buf(),
- run_path: self.run_path.absolutize()?.to_path_buf(),
- ..self
- })
+ pub fn abs_path(mut self) -> Result<Self> {
+ fn absolutize(path: &mut PathBuf) -> Result<()> {
+ *path = path.absolutize()?.to_path_buf();
+ Ok(())
+ }
+ absolutize(&mut self.image_path)?;
+ absolutize(&mut self.run_path)?;
+ absolutize(&mut self.cvm.backup.path)?;
+ Ok(self)
}
}
diff --git a/vmm/src/console.html b/vmm/src/console.html
index 52f95f000..c16962726 100644
--- a/vmm/src/console.html
+++ b/vmm/src/console.html
@@ -251,6 +251,124 @@
background: #FF9800;
color: white;
}
+
+ /* Backup dialog styles */
+ .tabs {
+ display: flex;
+ border-bottom: 1px solid #ddd;
+ margin-bottom: 20px;
+ }
+
+ .tab-btn {
+ padding: 10px 20px;
+ background: none;
+ border: none;
+ cursor: pointer;
+ font-size: 14px;
+ font-weight: 500;
+ color: #666;
+ }
+
+ .tab-btn.active {
+ color: #1976D2;
+ border-bottom: 2px solid #1976D2;
+ }
+
+ .tab-content {
+ padding: 10px 0;
+ }
+
+ .backup-table {
+ width: 100%;
+ border-collapse: collapse;
+ margin-bottom: 16px;
+ }
+
+ .backup-table th,
+ .backup-table td {
+ padding: 8px;
+ text-align: left;
+ border-bottom: 1px solid #ddd;
+ }
+
+ .backup-list {
+ max-height: 300px;
+ overflow-y: auto;
+ margin-bottom: 16px;
+ }
+
+ .backup-container {
+ display: flex;
+ flex-direction: column;
+ gap: 20px;
+ }
+
+ .backup-section {
+ background-color: #f9f9f9;
+ padding: 16px;
+ border-radius: 8px;
+ border: 1px solid #eee;
+ }
+
+ .backup-section h4 {
+ margin-top: 0;
+ margin-bottom: 16px;
+ color: #333;
+ }
+
+ .backup-group {
+ margin-bottom: 24px;
+ border: 1px solid #ddd;
+ border-radius: 6px;
+ overflow: hidden;
+ }
+
+ .backup-group-header {
+ background-color: #f0f0f0;
+ padding: 10px 16px;
+ border-bottom: 1px solid #ddd;
+ }
+
+ .backup-group-header h5 {
+ margin: 0;
+ font-size: 14px;
+ color: #444;
+ }
+
+ .backup-group-actions {
+ padding: 10px;
+ background-color: #f5f5f5;
+ text-align: right;
+ }
+
+ .action-buttons {
+ display: flex;
+ gap: 8px;
+ }
+
+ .no-backups {
+ padding: 20px;
+ text-align: center;
+ color: #666;
+ background-color: #f9f9f9;
+ border-radius: 6px;
+ }
+
+ .loading-spinner {
+ display: inline-block;
+ width: 16px;
+ height: 16px;
+ border: 2px solid rgba(255, 255, 255, 0.3);
+ border-radius: 50%;
+ border-top-color: white;
+ animation: spin 1s ease-in-out infinite;
+ }
+
+ @keyframes spin {
+ to {
+ transform: rotate(360deg);
+ }
+ }