Skip to content

Commit 67e3513

Browse files
authored
fix(chunk): use hash-based file IDs in SearchChunks resource names (#328)
**Because** - The `SearchChunks` endpoint was returning resource names with file UUIDs (e.g., `files/550e8400-e29b-41d4-a716-446655440000`) instead of hash-based IDs (e.g., `files/file-abc123`) - This caused citation URLs to be invalid since the API expects hash-based file IDs per AIP-122 naming convention - The `agent:collection:{id}` tags were documented using UID terminology, which was inconsistent with the actual hash-based ID format used by agent-backend (e.g., `col-xxx`) **This commit** - Updates `SearchChunks` in `pkg/handler/chunk.go` to: - Fetch the hash-based file ID along with file UID and display name - Build `fileUIDMapID` to map file UIDs to their canonical hash-based IDs - Use hash-based file IDs in chunk and file resource names returned in the response - Add fallback to UUID for legacy data that may not have hash-based IDs - Add debug logging to help trace resource name construction - Renames `extractCollectionUIDs` to `extractCollectionIDs` in `pkg/handler/converter.go` to reflect that these are hash-based IDs, not UUIDs - Updates comments and documentation to clarify the `agent:collection:{id}` tag format uses hash-based collection IDs (e.g., `col-xxx`) - Updates integration tests to use hash-based ID format (e.g., `col-fake-id-123`) instead of UID format - Updates protobuf comments to document the correct ID format
1 parent 748507f commit 67e3513

File tree

8 files changed

+51
-23
lines changed

8 files changed

+51
-23
lines changed

integration-test/grpc.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -959,7 +959,7 @@ export function TEST_19_UpdateFileAdmin(data) {
959959
{
960960
file: {
961961
name: `namespaces/${constant.defaultUserId}/knowledge-bases/${testKbId}/files/${fileId}`,
962-
tags: ["agent:collection:fake-uid-123", "user-tag"],
962+
tags: ["agent:collection:col-fake-id-123", "user-tag"],
963963
},
964964
update_mask: { paths: ["tags"] },
965965
},
@@ -970,7 +970,7 @@ export function TEST_19_UpdateFileAdmin(data) {
970970
"UpdateFileAdmin returns file": (r) => !!r.message?.file,
971971
"UpdateFileAdmin can set agent: reserved tags": (r) => {
972972
const tags = r.message?.file?.tags || [];
973-
return tags.includes("agent:collection:fake-uid-123");
973+
return tags.includes("agent:collection:col-fake-id-123");
974974
},
975975
});
976976

integration-test/proto/artifact/v1alpha/artifact_private_service.proto

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ service ArtifactPrivateService {
2727
// Update a file with system-reserved tags (admin only)
2828
//
2929
// Updates a file allowing system-reserved tag prefixes like "agent:".
30-
// Used by agent-backend to set collection association tags (e.g., "agent:collection:{uid}").
30+
// Used by agent-backend to set collection association tags (e.g., "agent:collection:{collectionID}").
3131
rpc UpdateFileAdmin(UpdateFileAdminRequest) returns (UpdateFileAdminResponse);
3232

3333
// Get Object (admin only)

integration-test/proto/artifact/v1alpha/file.proto

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -506,7 +506,7 @@ message ReprocessFileResponse {
506506
// UpdateFileAdminRequest represents a request to update a file with
507507
// system-reserved tags (admin only). Used by internal services like
508508
// agent-backend to set tags with reserved prefixes (e.g.,
509-
// "agent:collection:{uid}").
509+
// "agent:collection:{collectionID}").
510510
// Follows AIP-134: https://google.aip.dev/134
511511
message UpdateFileAdminRequest {
512512
// The file resource to update. The file's `name` field identifies the

integration-test/rest.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1783,7 +1783,7 @@ export function TEST_24_ReservedTagsValidation(data) {
17831783
displayName: data.dbIDPrefix + "agent-tag.txt",
17841784
type: "TYPE_TEXT",
17851785
content: constant.docSampleTxt,
1786-
tags: ["user-tag", "agent:collection:fake-uid-12345"]
1786+
tags: ["user-tag", "agent:collection:col-fake-id-12345"]
17871787
};
17881788

17891789
// API CHANGE: CreateFile now uses /files with knowledgeBaseId query param
@@ -1840,7 +1840,7 @@ export function TEST_24_ReservedTagsValidation(data) {
18401840
// Note: uid removed in AIP refactoring - use id for identification
18411841
if (normalFile && normalFile.id) {
18421842
const updateBody = {
1843-
tags: ["user-tag", "agent:collection:another-fake-uid"]
1843+
tags: ["user-tag", "agent:collection:col-another-fake-id"]
18441844
};
18451845

18461846
// API CHANGE: UpdateFile now uses /files/{file_id}

pkg/handler/chunk.go

Lines changed: 32 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -14,11 +14,11 @@ import (
1414
"github.com/instill-ai/artifact-backend/config"
1515
"github.com/instill-ai/artifact-backend/pkg/repository"
1616
"github.com/instill-ai/artifact-backend/pkg/types"
17+
"github.com/instill-ai/x/resource"
1718

1819
artifactpb "github.com/instill-ai/protogen-go/artifact/v1alpha"
1920
errorsx "github.com/instill-ai/x/errors"
2021
logx "github.com/instill-ai/x/log"
21-
"github.com/instill-ai/x/resource"
2222
)
2323

2424
// parseChunkFromName parses a resource name of format:
@@ -529,6 +529,7 @@ func (ph *PublicHandler) SearchChunks(
529529
ctx,
530530
fileUids,
531531
repository.FileColumn.UID,
532+
repository.FileColumn.ID,
532533
repository.FileColumn.DisplayName,
533534
)
534535
if err != nil {
@@ -538,8 +539,17 @@ func (ph *PublicHandler) SearchChunks(
538539
)
539540
}
540541

542+
// Build maps: FileUID -> DisplayName and FileUID -> hash-based ID
543+
fileUIDMapID := make(map[types.FileUIDType]string)
541544
for _, file := range files {
542545
fileUIDMapDisplayName[file.UID] = file.DisplayName
546+
// Use hash-based ID (e.g., "file-abc123") for resource names
547+
// Fall back to UID string if ID is empty (legacy data)
548+
if file.ID != "" {
549+
fileUIDMapID[file.UID] = file.ID
550+
} else {
551+
fileUIDMapID[file.UID] = file.UID.String()
552+
}
543553
}
544554

545555
// Build response with new protobuf format
@@ -550,16 +560,33 @@ func (ph *PublicHandler) SearchChunks(
550560
continue
551561
}
552562

553-
// Build full resource names for chunk and file
554-
chunkName := fmt.Sprintf("namespaces/%s/knowledge-bases/%s/files/%s/chunks/%s", namespaceID, kbID, chunk.FileUID.String(), chunk.ID)
555-
fileName := fmt.Sprintf("namespaces/%s/knowledge-bases/%s/files/%s", namespaceID, kbID, chunk.FileUID.String())
563+
// Get hash-based file ID for resource names (e.g., "file-abc123")
564+
// This ensures the citation URLs use the canonical ID format expected by the API
565+
fileID := fileUIDMapID[chunk.FileUID]
566+
if fileID == "" {
567+
// Fallback to UUID if not found (shouldn't happen normally)
568+
fileID = chunk.FileUID.String()
569+
logger.Warn("File ID not found in map, using UUID",
570+
zap.String("fileUID", chunk.FileUID.String()),
571+
zap.String("chunkUID", chunk.UID.String()))
572+
}
573+
574+
// Debug log to verify hash-based ID is being used
575+
logger.Debug("SearchChunks building resource name",
576+
zap.String("fileUID", chunk.FileUID.String()),
577+
zap.String("fileID", fileID),
578+
zap.String("chunkID", chunk.ID))
579+
580+
// Build full resource names for chunk and file using hash-based file ID
581+
chunkName := fmt.Sprintf("namespaces/%s/knowledge-bases/%s/files/%s/chunks/%s", namespaceID, kbID, fileID, chunk.ID)
582+
fileName := fmt.Sprintf("namespaces/%s/knowledge-bases/%s/files/%s", namespaceID, kbID, fileID)
556583

557584
pbChunk := &artifactpb.SimilarityChunk{
558585
Chunk: chunkName,
559586
SimilarityScore: float32(simChunksScores[i].Score),
560587
TextContent: string(chunkContents[i].Content),
561588
File: fileName,
562-
ChunkMetadata: convertToProtoChunk(chunk, namespaceID, kbID, chunk.FileUID.String()),
589+
ChunkMetadata: convertToProtoChunk(chunk, namespaceID, kbID, fileID),
563590
}
564591
simChunks = append(simChunks, pbChunk)
565592
}

pkg/handler/converter.go

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ import (
1919
// Reserved tag prefixes that users cannot set directly.
2020
// These are managed by the system.
2121
var reservedTagPrefixes = []string{
22-
"agent:", // Reserved for agent-backend (e.g., agent:collection:{uid})
22+
"agent:", // Reserved for agent-backend (e.g., agent:collection:{id} where {id} is hash-based like col-xxx)
2323
"instill-", // Reserved for internal system use
2424
}
2525

@@ -36,19 +36,20 @@ func validateUserTags(tags []string) error {
3636
return nil
3737
}
3838

39-
// extractCollectionUIDs extracts collection UIDs from tags with prefix "agent:collection:".
40-
func extractCollectionUIDs(tags []string) []string {
39+
// extractCollectionIDs extracts collection IDs from tags with prefix "agent:collection:".
40+
// The IDs are hash-based resource IDs (e.g., col-xxx), not UUIDs.
41+
func extractCollectionIDs(tags []string) []string {
4142
const collectionTagPrefix = "agent:collection:"
42-
var collectionUIDs []string
43+
var collectionIDs []string
4344
for _, tag := range tags {
4445
if strings.HasPrefix(tag, collectionTagPrefix) {
45-
uid := strings.TrimPrefix(tag, collectionTagPrefix)
46-
if uid != "" {
47-
collectionUIDs = append(collectionUIDs, uid)
46+
id := strings.TrimPrefix(tag, collectionTagPrefix)
47+
if id != "" {
48+
collectionIDs = append(collectionIDs, id)
4849
}
4950
}
5051
}
51-
return collectionUIDs
52+
return collectionIDs
5253
}
5354

5455
// convertKBToCatalogPB converts database KnowledgeBase to protobuf KnowledgeBase.
@@ -130,7 +131,7 @@ func convertKBFileToPB(kbf *repository.FileModel, ns *resource.Namespace, kb *re
130131
if len(kbf.Tags) > 0 {
131132
file.Tags = kbf.Tags
132133
// Extract collection UIDs from tags with prefix "agent:collection:"
133-
file.Collections = extractCollectionUIDs(kbf.Tags)
134+
file.Collections = extractCollectionIDs(kbf.Tags)
134135
}
135136

136137
if kbf.ExternalMetadataUnmarshal != nil {

pkg/handler/file.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -621,7 +621,7 @@ func (ph *PublicHandler) CreateFile(ctx context.Context, req *artifactpb.CreateF
621621
Object: objectResourceName,
622622
ConvertingPipeline: res.ConvertingPipeline(),
623623
Tags: res.Tags,
624-
Collections: extractCollectionUIDs(res.Tags),
624+
Collections: extractCollectionIDs(res.Tags),
625625
},
626626
}, nil
627627
}
@@ -913,7 +913,7 @@ func (ph *PublicHandler) ListFiles(ctx context.Context, req *artifactpb.ListFile
913913
DownloadUrl: downloadURL,
914914
ConvertingPipeline: kbFile.ConvertingPipeline(),
915915
Tags: []string(kbFile.Tags),
916-
Collections: extractCollectionUIDs(kbFile.Tags),
916+
Collections: extractCollectionIDs(kbFile.Tags),
917917
}
918918

919919
// Include status message (error or success message)

pkg/handler/private.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -288,7 +288,7 @@ func (h *PrivateHandler) UpdateKnowledgeBaseAdmin(ctx context.Context, req *arti
288288

289289
// UpdateFileAdmin updates a file with system-reserved tags (admin only).
290290
// Unlike the public UpdateFile, this endpoint:
291-
// - Does NOT validate reserved tag prefixes (allows "agent:collection:{uid}" etc.)
291+
// - Does NOT validate reserved tag prefixes (allows "agent:collection:{id}" where {id} is hash-based like col-xxx)
292292
// - Does NOT require ACL checks (admin-only access)
293293
func (h *PrivateHandler) UpdateFileAdmin(ctx context.Context, req *artifactpb.UpdateFileAdminRequest) (*artifactpb.UpdateFileAdminResponse, error) {
294294
logger, _ := logx.GetZapLogger(ctx)

0 commit comments

Comments
 (0)