Martin
09/04/2023, 1:43 PM
df = (
spark.range(10).toDF("id")
.withColumn("some_string", F.col("id").cast("string"))
.withMetadata("some_string", {"derived": True, "source": ["id"]})
)
df.write.format("delta").save(tmp_path)
_Delta log_:
{"commitInfo":{"timestamp":1693832260632,"operation":"WRITE","operationParameters":{"mode":"ErrorIfExists","partitionBy":"[]"},"isolationLevel":"Serializable","isBlindAppend":true,"operationMetrics":{"numFiles":"1","numOutputRows":"10","numOutputBytes":"840"},"engineInfo":"Apache-Spark/3.3.1.5.2-92314920 Delta-Lake/2.2.0.6","txnId":"028ec17c-1bd1-471c-8cce-b0e80b23d9b3"}}
{"protocol":{"minReaderVersion":1,"minWriterVersion":2}}
{"metaData":{"id":"eacf7444-66e2-4997-a068-8ee796f14efd","format":{"provider":"parquet","options":{}},"schemaString":"{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"some_string\",\"type\":\"string\",\"nullable\":true,\"metadata\":{\"derived\":true,\"source\":[\"id\"]}}]}","partitionColumns":[],"configuration":{},"createdTime":1693832259833}}
{"add":{"path":"part-00000-23a6105e-abe8-4048-9359-257395360e36-c000.snappy.parquet","partitionValues":{},"size":840,"modificationTime":1693832260476,"dataChange":true,"stats":"{\"numRecords\":10,\"minValues\":{\"id\":0,\"some_string\":\"0\"},\"maxValues\":{\"id\":9,\"some_string\":\"9\"},\"nullCount\":{\"id\":0,\"some_string\":0}}","tags":{}}}