-
Notifications
You must be signed in to change notification settings - Fork 38
Open
Description
Test Code
// Reproduction: backfill a fixed-size-list (embedding) column onto an existing
// Lance table via "alter table ... add columns ... from <view>". The commit path
// fails in LanceArrowUtils.fromArrowField with
// "FixedSizeList field randFloat3 has no children" (see stack trace below).
@Test
public void testAddEmbedding() {
// Seeds fullTable; presumably writes the 10 rows asserted below — TODO confirm.
prepareDataset();
Dataset<Row> result = spark.table(fullTable);
assertEquals(10, result.count(), "Should have 10 rows");
// Keep the Lance row-address / fragment-id system columns plus the key column,
// which the add-columns backfill uses to align new values with existing rows.
result = result.select("_rowaddr", "_fragid", "id");
// UDF producing a 3-element float vector; the input argument is ignored.
// NOTE(review): unseeded Random makes this UDF nondeterministic per invocation.
UDF1<Object, float[]> randFunc = obj -> {
Random rnd = new Random();
return new float[]{rnd.nextFloat(), rnd.nextFloat(), rnd.nextFloat()};
};
// Registered Spark-side as array<float> (non-nullable elements), not as a
// fixed-size list — the fixed size is conveyed only through column metadata.
spark.udf().register("randFloat3", randFunc,
DataTypes.createArrayType(DataTypes.FloatType, false));
// Column metadata instructing the Lance/Arrow converter to treat the float
// array as a FixedSizeList of length 3.
Metadata meta = new MetadataBuilder()
.putLong("arrow.fixed-size-list.size", 3L)
.build();
// Attach the metadata via the withColumn(String, Column, Metadata) overload.
Dataset<Row> df = result
.withColumn("randFloat3", callUDF("randFloat3", col("id")), meta);
df.createOrReplaceTempView("tmp_view");
// This backfill statement is what triggers the exception at commit time.
spark.sql(String.format("alter table %s add columns randFloat3 from tmp_view", fullTable));
}

Exception
Caused by: org.apache.spark.SparkException: FixedSizeList field randFloat3 has no children
at org.apache.spark.sql.util.LanceArrowUtils$.fromArrowField(LanceArrowUtils.scala:54)
at org.apache.spark.sql.util.LanceArrowUtils$.$anonfun$fromArrowSchema$1(LanceArrowUtils.scala:86)
at scala.collection.TraversableLike.$anonfun$map$1(TraversableLike.scala:286)
at scala.collection.Iterator.foreach(Iterator.scala:943)
at scala.collection.Iterator.foreach$(Iterator.scala:943)
at scala.collection.AbstractIterator.foreach(Iterator.scala:1431)
at scala.collection.IterableLike.foreach(IterableLike.scala:74)
at scala.collection.IterableLike.foreach$(IterableLike.scala:73)
at scala.collection.AbstractIterable.foreach(Iterable.scala:56)
at scala.collection.TraversableLike.map(TraversableLike.scala:286)
at scala.collection.TraversableLike.map$(TraversableLike.scala:279)
at scala.collection.AbstractTraversable.map(Traversable.scala:108)
at org.apache.spark.sql.util.LanceArrowUtils$.fromArrowSchema(LanceArrowUtils.scala:85)
at org.apache.spark.sql.util.LanceArrowUtils.fromArrowSchema(LanceArrowUtils.scala)
at com.lancedb.lance.spark.write.AddColumnsBackfillBatchWrite$AddColumnsWriter.commit(AddColumnsBackfillBatchWrite.java:214)
at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.$anonfun$run$5(WriteToDataSourceV2Exec.scala:475)
at org.apache.spark.util.Utils$.tryWithSafeFinallyAndFailureCallbacks(Utils.scala:1397)
at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run(WriteToDataSourceV2Exec.scala:491)
at org.apache.spark.sql.execution.datasources.v2.WritingSparkTask.run$(WriteToDataSourceV2Exec.scala:430)
at org.apache.spark.sql.execution.datasources.v2.DataWritingSparkTask$.run(WriteToDataSourceV2Exec.scala:496)
at org.apache.spark.sql.execution.datasources.v2.V2TableWriteExec.$anonfun$writeWithV2$2(WriteToDataSourceV2Exec.scala:393)
at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:93)
at org.apache.spark.TaskContext.runTaskWithListeners(TaskContext.scala:166)
at org.apache.spark.scheduler.Task.run(Task.scala:141)

Analyze
If Arrow field is:
Field { name: "randFloat3", data_type: FixedSizeList(Field { name: "element", data_type: Float32, nullable: false, dict_id: 0, dict_is_ordered: false, metadata: {} }, 3), nullable: true, dict_id: 0, dict_is_ordered: false, metadata: {"arrow.fixed-size-list.size": "3"} }
then the converted Lance field is:
Field { name: "randFloat3", id: 2, parent_id: -1, logical_type: LogicalType("fixed_size_list:float:3"), metadata: {"arrow.fixed-size-list.size": "3"}, encoding: Some(Plain), nullable: true, children: [], dictionary: None, unenforced_primary_key: false }
Note that `children` is empty (`children: []`) even though the logical type is a fixed-size list: when this Lance field is converted back to an Arrow schema, `LanceArrowUtils.fromArrowField` expects the FixedSizeList to carry its element child field and throws "FixedSizeList field randFloat3 has no children".
Metadata
Assignees
Labels
No labels