fschutt · ronnybremer · Nov 17, 2025 · Nov 17, 2025 · Nov 18, 2025 · fschutt
diff --git a/src/cmap.rs b/src/cmap.rs
@@ -6,7 +6,7 @@ use lopdf::{Dictionary, Document, Object};
 use crate::text::CMap;
 
 /// The mapping from a CID to one or more Unicode code points.
-#[derive(Debug)]
+#[derive(Debug, Clone)]
 pub struct ToUnicodeCMap {
     pub mappings: BTreeMap<u32, Vec<u32>>,
 }
@@ -91,7 +91,7 @@ impl ToUnicodeCMap {
     }
 
     /// Generates a CMap string representation suitable for embedding in a PDF.
-    pub fn to_cmap_string(&self, font_name: &str) -> String {
+    pub fn to_cmap_string(&self, font_name: &str, single_byte_cids: bool) -> String {
         // Header section
         let mut result = format!(
             "/CIDInit /ProcSet findresource begin\n\n12 dict begin\n\nbegincmap\n\n%!PS-Adobe-3.0 \
@@ -125,7 +125,20 @@ impl ToUnicodeCMap {
             for chunk in entries.chunks(100) {
                 result.push_str(&format!("{} beginbfchar\n", chunk.len()));
                 for &(cid, unicode) in chunk {
-                    result.push_str(&format!("<{:04X}> <{:04X}>\n", cid, unicode));
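+                    // Force a 2-byte representation for unicode values <= 0xFFFF, 4 bytes otherwise. NOTE(review): the PDF spec defines ToUnicode destination values as UTF-16BE, so values > 0xFFFF presumably need a surrogate pair rather than 8 hex digits of the scalar value - confirm against the CMap parser.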
+                    if unicode <= 0xFFFF {
+                        if single_byte_cids {
+                            result.push_str(&format!("<{:02X}> <{:04X}>\n", cid, unicode));
+                        } else {
+                            result.push_str(&format!("<{:04X}> <{:04X}>\n", cid, unicode));
+                        }
+                    } else {
+                        if single_byte_cids {
+                            result.push_str(&format!("<{:02X}> <{:08X}>\n", cid, unicode));
+                        } else {
+                            result.push_str(&format!("<{:04X}> <{:08X}>\n", cid, unicode));
+                        }
+                    }
                 }
                 result.push_str("endbfchar\n");
             }
@@ -158,7 +171,27 @@ fn parse_hex_token(token: &str) -> Result<u32, String> {
 
 /// Implement the CMap trait on our ToUnicodeCMap.
 impl CMap for ToUnicodeCMap {
+    /// Maps single-byte character codes to their Unicode representation.
     fn map_bytes(&self, bytes: &[u8]) -> String {
+        // Every input byte is treated as one complete character code.
+        // Unmapped codes and values that are not valid Unicode scalar
+        // values contribute nothing to the output.
+        let mut decoded = String::new();
+        for &byte in bytes {
+            let code = u32::from(byte);
+            if let Some(targets) = self.mappings.get(&code) {
+                decoded.extend(
+                    targets
+                        .iter()
+                        .filter_map(|&scalar| std::char::from_u32(scalar)),
+                );
+            }
+        }
+        decoded
+    }
+
+    /// map double byte characters to their unicode representation
+    fn map_bytes_u16be(&self, bytes: &[u8]) -> String {
         // For simplicity, assume that the byte sequence represents CIDs in big-endian,
         // and that each CID is 2 bytes long.
         let mut result = String::new();

diff --git a/src/deserialize.rs b/src/deserialize.rs
diff --git a/src/font.rs b/src/font.rs
@@ -304,6 +304,8 @@ impl BuiltinFont {
 pub enum FontType {
     OpenTypeCFF(Vec<u8>),
     OpenTypeCFF2,
+    ParsedEmbeddedType0(Vec<u8>), // set by ParsedSubsetFont::ttf_from_bytes; holds a copy of the font bytes
+    ParsedEmbeddedType1C(Vec<u8>), // set by ParsedSubsetFont::cff_from_bytes; holds a copy of the font bytes
     #[default]
     TrueType,
 }
@@ -438,6 +440,68 @@ impl ParsedFont {
     }
 }
 
+#[derive(Clone, Default)]
+pub struct ParsedSubsetFontCustomEncoding { // value type of ParsedSubsetFontProperties::custom_encoding
+    pub base_encoding: Option<String>, // NOTE(review): presumably the /BaseEncoding name - confirm in the parser
+    pub differences: Option<Vec<lopdf::Object>>, // kept as raw lopdf objects; presumably the /Differences array - confirm
+}
+
+#[derive(Clone, Default)]
+pub struct ParsedSubsetFontCIDSystemInfo { // value type of ParsedSubsetFontDescendantFont::cid_system_info
+    pub ordering: Option<String>, // NOTE(review): presumably the /Ordering entry - confirm in the parser
+    pub registry: Option<String>, // NOTE(review): presumably the /Registry entry - confirm in the parser
+    pub supplement: Option<i64>, // NOTE(review): presumably the /Supplement entry - confirm in the parser
+}
+
+#[derive(Clone, Default)]
+pub struct ParsedSubsetFontDescendantFont { // element type of ParsedSubsetFontProperties::descendant_fonts
+    pub base_font: Option<String>,
+    pub subtype: Option<String>,
+    pub dw: Option<i64>, // NOTE(review): presumably the /DW default glyph width - confirm in the parser
+    pub cid_to_gid_map: Option<String>, // a string only; NOTE(review): confirm stream-valued maps are not needed
+    pub cid_system_info: Option<ParsedSubsetFontCIDSystemInfo>,
+}
+
+#[derive(Clone, Default)]
+pub struct ParsedSubsetFontProperties { // type of ParsedSubsetFont::font_properties; every field is optional
+    pub encoding: Option<String>,
+    pub custom_encoding: Option<ParsedSubsetFontCustomEncoding>,
+    pub first_char: Option<i64>,
+    pub last_char: Option<i64>,
+    pub widths: Option<Vec<lopdf::Object>>, // kept as raw lopdf objects, not converted to numbers
+    pub base_font: Option<String>,
+    pub descendant_fonts: Option<Vec<ParsedSubsetFontDescendantFont>>,
+}
+
+#[derive(Clone, Default)]
+pub struct ParsedSubsetFontDescriptorProperties { // type of ParsedSubsetFont::font_descriptor_properties; every field is optional
+    pub charset: Option<String>,
+    pub font_family: Option<String>,
+    pub font_stretch: Option<String>,
+    pub ascent: Option<i64>,
+    pub descent: Option<i64>,
+    pub cap_height: Option<i64>,
+    pub flags: Option<i64>,
+    pub italic_angle: Option<i64>, // NOTE(review): integer only - confirm fractional angles never need to be preserved
+    pub font_weight: Option<i64>,
+    pub stemv: Option<i64>,
+    pub xheight: Option<i64>,
+    pub font_bbox: Option<Vec<lopdf::Object>>, // kept as raw lopdf objects, not converted to numbers
+    pub cid_set: Option<Vec<u8>>, // raw bytes; NOTE(review): presumably the /CIDSet stream content - confirm
+}
+
+/// A font that was embedded in the PDF as a subset. In contrast to `ParsedFont` it cannot be used
+/// to shape new text; on serialization it is re-embedded as is, keeping parsed text elements intact.
+#[derive(Clone, Default)]
+pub struct ParsedSubsetFont {
+    pub original_bytes: Vec<u8>, // copy of the bytes passed to ttf_from_bytes / cff_from_bytes
+    pub font_type: FontType,
+    pub font_name: Option<String>, // cff_from_bytes fills this from the CFF name index; ttf_from_bytes leaves None
+    pub cmap: Option<ToUnicodeCMap>, // both constructors leave this None
+    pub font_properties: ParsedSubsetFontProperties,
+    pub font_descriptor_properties: ParsedSubsetFontDescriptorProperties,
+}
+
 pub trait PrepFont {
     fn lgi(&self, codepoint: u32) -> Option<u32>;
 
@@ -510,6 +574,19 @@ impl fmt::Debug for ParsedFont {
     }
 }
 
+impl PartialEq for ParsedSubsetFont {
+    fn eq(&self, other: &Self) -> bool {
+        self.original_bytes == other.original_bytes // byte-wise: equal length alone is not equality
+    }
+}
+
+impl fmt::Debug for ParsedSubsetFont {
+    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
+        // No fields are registered, so only the type name is printed.
+        f.debug_struct("ParsedSubsetFont").finish()
+    }
+}
+
 #[derive(Debug, Clone)]
 pub struct SubsetFont {
     pub bytes: Vec<u8>,
@@ -692,7 +769,7 @@ impl ParsedFont {
 
         // Create the CMap and generate its string representation
         let cmap = ToUnicodeCMap { mappings };
-        cmap.to_cmap_string(&font_id.0)
+        cmap.to_cmap_string(&font_id.0, false)
     }
 
     pub(crate) fn generate_gid_to_cid_map(&self, glyph_ids: &[(u16, char)]) -> Vec<(u16, u16)> {
@@ -1614,6 +1691,66 @@ impl ParsedFont {
     }
 }
 
+impl ParsedSubsetFont {
+    /// Wraps embedded TrueType font bytes without parsing or validating them.
+    ///
+    /// Pushes an info message onto `warnings` and always returns `Some`; `_font_index` is unused.
+    /// Fields other than `original_bytes` and `font_type` keep their `Default` values.
+    pub fn ttf_from_bytes(
+        font_bytes: &[u8],
+        _font_index: usize,
+        warnings: &mut Vec<PdfWarnMsg>,
+    ) -> Option<Self> {
+        warnings.push(PdfWarnMsg::info(
+            0,
+            0,
+            "Successfully read embedded TrueType font data".to_string(),
+        ));
+        Some(Self {
+            original_bytes: font_bytes.to_vec(),
+            font_type: FontType::ParsedEmbeddedType0(font_bytes.to_vec()),
+            ..Self::default()
+        })
+    }
+
+    /// Reads embedded CFF font bytes and takes the first name-index entry as `font_name`.
+    ///
+    /// Pushes an info message on success; if the CFF data cannot be read, pushes a warning and
+    /// returns `None`. `_font_index` is unused; the remaining fields keep their `Default` values.
+    pub fn cff_from_bytes(
+        font_bytes: &[u8],
+        _font_index: usize,
+        warnings: &mut Vec<PdfWarnMsg>,
+    ) -> Option<Self> {
+        let scope = allsorts_subset_browser::binary::read::ReadScope::new(font_bytes);
+        let cff = match scope.read::<allsorts_subset_browser::cff::CFF<'_>>() {
+            Ok(cff) => cff,
+            Err(e) => {
+                warnings.push(PdfWarnMsg::warning(
+                    0,
+                    0,
+                    format!("Failed to read embedded CFF font data: {}", e),
+                ));
+                return None;
+            }
+        };
+        warnings.push(PdfWarnMsg::info(
+            0,
+            0,
+            "Successfully read embedded CFF font data".to_string(),
+        ));
+        // Lossy UTF-8 conversion: invalid bytes in the name become U+FFFD.
+        let font_name = cff.name_index.iter().next()
+            .map(|name| String::from_utf8_lossy(name).into_owned());
+        Some(Self {
+            original_bytes: font_bytes.to_vec(),
+            font_type: FontType::ParsedEmbeddedType1C(font_bytes.to_vec()),
+            font_name,
+            ..Self::default()
+        })
+    }
+}
+
 #[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
 #[repr(C)]
 pub struct FontMetrics {

diff --git a/src/lib.rs b/src/lib.rs
@@ -449,6 +449,9 @@ pub struct PdfResources {
     /// Fonts found in the PDF file, indexed by the sha256 of their contents
     #[serde(default)]
     pub fonts: PdfFontMap,
+    /// Fonts found embedded as subset fonts in the PDF file, indexed by the sha256 of their contents
+    #[serde(skip)]
+    pub subsetfonts: PdfSubsetFontMap,
     /// XObjects (forms, images, embedded PDF contents, etc.)
     #[serde(default)]
     pub xobjects: XObjectMap,
@@ -487,6 +490,11 @@ pub struct PdfFontMap {
     pub map: BTreeMap<FontId, ParsedFont>,
 }
 
+#[derive(Debug, PartialEq, Default, Clone)]
+pub struct PdfSubsetFontMap { // type of PdfResources::subsetfonts
+    pub map: BTreeMap<FontId, ParsedSubsetFont>, // subset fonts keyed by FontId
+}
+
 #[derive(Debug, PartialEq, Default, Clone)]
 pub struct ParsedIccProfile {}