fix(chunk): use hash-based file IDs in SearchChunks resource names (#328)

pinglin · web-flow · commit 67e3513cd124 · 2026-01-31T12:21:24.000Z
**Because**

- The `SearchChunks` endpoint was returning resource names with file
UUIDs (e.g., `files/550e8400-e29b-41d4-a716-446655440000`) instead of
hash-based IDs (e.g., `files/file-abc123`)
- This caused citation URLs to be invalid since the API expects
hash-based file IDs per AIP-122 naming convention
- The `agent:collection:{id}` tags were documented using UID
terminology, which was inconsistent with the actual hash-based ID format
used by agent-backend (e.g., `col-xxx`)

**This commit**

- Updates `SearchChunks` in `pkg/handler/chunk.go` to:
  - Fetch the hash-based file ID along with file UID and display name
  - Build `fileUIDMapID` to map file UIDs to their canonical hash-based
    IDs
  - Use hash-based file IDs in chunk and file resource names returned in
    the response
  - Add fallback to UUID for legacy data that may not have hash-based IDs
  - Add debug logging to help trace resource name construction
- Renames `extractCollectionUIDs` to `extractCollectionIDs` in
`pkg/handler/converter.go` to reflect that these are hash-based IDs, not
UUIDs
- Updates comments and documentation to clarify the
`agent:collection:{id}` tag format uses hash-based collection IDs (e.g.,
`col-xxx`)
- Updates integration tests to use hash-based ID format (e.g.,
`col-fake-id-123`) instead of UID format
- Updates protobuf comments to document the correct ID format
diff --git a/integration-test/grpc.js b/integration-test/grpc.js
@@ -959,7 +959,7 @@ export function TEST_19_UpdateFileAdmin(data) {
       {
         file: {
           name: `namespaces/${constant.defaultUserId}/knowledge-bases/${testKbId}/files/${fileId}`,
-          tags: ["agent:collection:fake-uid-123", "user-tag"],
+          tags: ["agent:collection:col-fake-id-123", "user-tag"],
         },
         update_mask: { paths: ["tags"] },
       },
@@ -970,7 +970,7 @@ export function TEST_19_UpdateFileAdmin(data) {
       "UpdateFileAdmin returns file": (r) => !!r.message?.file,
       "UpdateFileAdmin can set agent: reserved tags": (r) => {
         const tags = r.message?.file?.tags || [];
-        return tags.includes("agent:collection:fake-uid-123");
+        return tags.includes("agent:collection:col-fake-id-123");
       },
     });
 
diff --git a/integration-test/proto/artifact/v1alpha/artifact_private_service.proto b/integration-test/proto/artifact/v1alpha/artifact_private_service.proto
@@ -27,7 +27,7 @@ service ArtifactPrivateService {
   // Update a file with system-reserved tags (admin only)
   //
   // Updates a file allowing system-reserved tag prefixes like "agent:".
-  // Used by agent-backend to set collection association tags (e.g., "agent:collection:{uid}").
+  // Used by agent-backend to set collection association tags (e.g., "agent:collection:{collectionID}").
   rpc UpdateFileAdmin(UpdateFileAdminRequest) returns (UpdateFileAdminResponse);
 
   // Get Object (admin only)
diff --git a/integration-test/proto/artifact/v1alpha/file.proto b/integration-test/proto/artifact/v1alpha/file.proto
@@ -506,7 +506,7 @@ message ReprocessFileResponse {
 // UpdateFileAdminRequest represents a request to update a file with
 // system-reserved tags (admin only). Used by internal services like
 // agent-backend to set tags with reserved prefixes (e.g.,
-// "agent:collection:{uid}").
+// "agent:collection:{collectionID}").
 // Follows AIP-134: https://google.aip.dev/134
 message UpdateFileAdminRequest {
   // The file resource to update. The file's `name` field identifies the
diff --git a/integration-test/rest.js b/integration-test/rest.js
@@ -1783,7 +1783,7 @@ export function TEST_24_ReservedTagsValidation(data) {
       displayName: data.dbIDPrefix + "agent-tag.txt",
       type: "TYPE_TEXT",
       content: constant.docSampleTxt,
-      tags: ["user-tag", "agent:collection:fake-uid-12345"]
+      tags: ["user-tag", "agent:collection:col-fake-id-12345"]
     };
 
     // API CHANGE: CreateFile now uses /files with knowledgeBaseId query param
@@ -1840,7 +1840,7 @@ export function TEST_24_ReservedTagsValidation(data) {
     // Note: uid removed in AIP refactoring - use id for identification
     if (normalFile && normalFile.id) {
       const updateBody = {
-        tags: ["user-tag", "agent:collection:another-fake-uid"]
+        tags: ["user-tag", "agent:collection:col-another-fake-id"]
       };
 
       // API CHANGE: UpdateFile now uses /files/{file_id}
diff --git a/pkg/handler/chunk.go b/pkg/handler/chunk.go
@@ -14,11 +14,11 @@ import (
 	"github.com/instill-ai/artifact-backend/config"
 	"github.com/instill-ai/artifact-backend/pkg/repository"
 	"github.com/instill-ai/artifact-backend/pkg/types"
+	"github.com/instill-ai/x/resource"
 
 	artifactpb "github.com/instill-ai/protogen-go/artifact/v1alpha"
 	errorsx "github.com/instill-ai/x/errors"
 	logx "github.com/instill-ai/x/log"
-	"github.com/instill-ai/x/resource"
 )
 
 // parseChunkFromName parses a resource name of format:
@@ -529,6 +529,7 @@ func (ph *PublicHandler) SearchChunks(
 		ctx,
 		fileUids,
 		repository.FileColumn.UID,
+		repository.FileColumn.ID,
 		repository.FileColumn.DisplayName,
 	)
 	if err != nil {
@@ -538,8 +539,17 @@ func (ph *PublicHandler) SearchChunks(
 		)
 	}
 
+	// Build maps: FileUID -> DisplayName and FileUID -> hash-based ID
+	fileUIDMapID := make(map[types.FileUIDType]string)
 	for _, file := range files {
 		fileUIDMapDisplayName[file.UID] = file.DisplayName
+		// Use hash-based ID (e.g., "file-abc123") for resource names
+		// Fall back to UID string if ID is empty (legacy data)
+		if file.ID != "" {
+			fileUIDMapID[file.UID] = file.ID
+		} else {
+			fileUIDMapID[file.UID] = file.UID.String()
+		}
 	}
 
 	// Build response with new protobuf format
@@ -550,16 +560,33 @@ func (ph *PublicHandler) SearchChunks(
 			continue
 		}
 
-		// Build full resource names for chunk and file
-		chunkName := fmt.Sprintf("namespaces/%s/knowledge-bases/%s/files/%s/chunks/%s", namespaceID, kbID, chunk.FileUID.String(), chunk.ID)
-		fileName := fmt.Sprintf("namespaces/%s/knowledge-bases/%s/files/%s", namespaceID, kbID, chunk.FileUID.String())
+		// Get hash-based file ID for resource names (e.g., "file-abc123")
+		// This ensures the citation URLs use the canonical ID format expected by the API
+		fileID := fileUIDMapID[chunk.FileUID]
+		if fileID == "" {
+			// Fallback to UUID if not found (shouldn't happen normally)
+			fileID = chunk.FileUID.String()
+			logger.Warn("File ID not found in map, using UUID",
+				zap.String("fileUID", chunk.FileUID.String()),
+				zap.String("chunkUID", chunk.UID.String()))
+		}
+
+		// Debug log to verify hash-based ID is being used
+		logger.Debug("SearchChunks building resource name",
+			zap.String("fileUID", chunk.FileUID.String()),
+			zap.String("fileID", fileID),
+			zap.String("chunkID", chunk.ID))
+
+		// Build full resource names for chunk and file using hash-based file ID
+		chunkName := fmt.Sprintf("namespaces/%s/knowledge-bases/%s/files/%s/chunks/%s", namespaceID, kbID, fileID, chunk.ID)
+		fileName := fmt.Sprintf("namespaces/%s/knowledge-bases/%s/files/%s", namespaceID, kbID, fileID)
 
 		pbChunk := &artifactpb.SimilarityChunk{
 			Chunk:           chunkName,
 			SimilarityScore: float32(simChunksScores[i].Score),
 			TextContent:     string(chunkContents[i].Content),
 			File:            fileName,
-			ChunkMetadata:   convertToProtoChunk(chunk, namespaceID, kbID, chunk.FileUID.String()),
+			ChunkMetadata:   convertToProtoChunk(chunk, namespaceID, kbID, fileID),
 		}
 		simChunks = append(simChunks, pbChunk)
 	}
diff --git a/pkg/handler/converter.go b/pkg/handler/converter.go
@@ -19,7 +19,7 @@ import (
 // Reserved tag prefixes that users cannot set directly.
 // These are managed by the system.
 var reservedTagPrefixes = []string{
-	"agent:",   // Reserved for agent-backend (e.g., agent:collection:{uid})
+	"agent:",   // Reserved for agent-backend (e.g., agent:collection:{id} where {id} is hash-based like col-xxx)
 	"instill-", // Reserved for internal system use
 }
 
@@ -36,19 +36,20 @@ func validateUserTags(tags []string) error {
 	return nil
 }
 
-// extractCollectionUIDs extracts collection UIDs from tags with prefix "agent:collection:".
-func extractCollectionUIDs(tags []string) []string {
+// extractCollectionIDs extracts collection IDs from tags with prefix "agent:collection:".
+// The IDs are hash-based resource IDs (e.g., col-xxx), not UUIDs.
+func extractCollectionIDs(tags []string) []string {
 	const collectionTagPrefix = "agent:collection:"
-	var collectionUIDs []string
+	var collectionIDs []string
 	for _, tag := range tags {
 		if strings.HasPrefix(tag, collectionTagPrefix) {
-			uid := strings.TrimPrefix(tag, collectionTagPrefix)
-			if uid != "" {
-				collectionUIDs = append(collectionUIDs, uid)
+			id := strings.TrimPrefix(tag, collectionTagPrefix)
+			if id != "" {
+				collectionIDs = append(collectionIDs, id)
 			}
 		}
 	}
-	return collectionUIDs
+	return collectionIDs
 }
 
 // convertKBToCatalogPB converts database KnowledgeBase to protobuf KnowledgeBase.
@@ -130,7 +131,7 @@ func convertKBFileToPB(kbf *repository.FileModel, ns *resource.Namespace, kb *re
 	if len(kbf.Tags) > 0 {
 		file.Tags = kbf.Tags
 		// Extract collection UIDs from tags with prefix "agent:collection:"
-		file.Collections = extractCollectionUIDs(kbf.Tags)
+		file.Collections = extractCollectionIDs(kbf.Tags)
 	}
 
 	if kbf.ExternalMetadataUnmarshal != nil {
diff --git a/pkg/handler/file.go b/pkg/handler/file.go
@@ -621,7 +621,7 @@ func (ph *PublicHandler) CreateFile(ctx context.Context, req *artifactpb.CreateF
 			Object:             objectResourceName,
 			ConvertingPipeline: res.ConvertingPipeline(),
 			Tags:               res.Tags,
-			Collections:        extractCollectionUIDs(res.Tags),
+			Collections:        extractCollectionIDs(res.Tags),
 		},
 	}, nil
 }
@@ -913,7 +913,7 @@ func (ph *PublicHandler) ListFiles(ctx context.Context, req *artifactpb.ListFile
 		DownloadUrl:        downloadURL,
 		ConvertingPipeline: kbFile.ConvertingPipeline(),
 		Tags:               []string(kbFile.Tags),
-		Collections:        extractCollectionUIDs(kbFile.Tags),
+		Collections:        extractCollectionIDs(kbFile.Tags),
 	}
 
 		// Include status message (error or success message)
diff --git a/pkg/handler/private.go b/pkg/handler/private.go
@@ -288,7 +288,7 @@ func (h *PrivateHandler) UpdateKnowledgeBaseAdmin(ctx context.Context, req *arti
 
 // UpdateFileAdmin updates a file with system-reserved tags (admin only).
 // Unlike the public UpdateFile, this endpoint:
-// - Does NOT validate reserved tag prefixes (allows "agent:collection:{uid}" etc.)
+// - Does NOT validate reserved tag prefixes (allows "agent:collection:{id}" where {id} is hash-based like col-xxx)
 // - Does NOT require ACL checks (admin-only access)
 func (h *PrivateHandler) UpdateFileAdmin(ctx context.Context, req *artifactpb.UpdateFileAdminRequest) (*artifactpb.UpdateFileAdminResponse, error) {
 	logger, _ := logx.GetZapLogger(ctx)