Skip to content
Draft
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 36 additions & 3 deletions src/cmap.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ use lopdf::{Dictionary, Document, Object};
use crate::text::CMap;

/// The mapping from a CID to one or more Unicode code points.
///
/// Keys of `mappings` are CIDs; each value lists the code points (stored as raw
/// `u32`s) that the CID expands to, in output order.
#[derive(Debug, Clone)]
pub struct ToUnicodeCMap {
    pub mappings: BTreeMap<u32, Vec<u32>>,
}
Expand Down Expand Up @@ -91,7 +91,7 @@ impl ToUnicodeCMap {
}

/// Generates a CMap string representation suitable for embedding in a PDF.
pub fn to_cmap_string(&self, font_name: &str) -> String {
pub fn to_cmap_string(&self, font_name: &str, single_byte_cids: bool) -> String {
// Header section
let mut result = format!(
"/CIDInit /ProcSet findresource begin\n\n12 dict begin\n\nbegincmap\n\n%!PS-Adobe-3.0 \
Expand Down Expand Up @@ -125,7 +125,20 @@ impl ToUnicodeCMap {
for chunk in entries.chunks(100) {
result.push_str(&format!("{} beginbfchar\n", chunk.len()));
for &(cid, unicode) in chunk {
result.push_str(&format!("<{:04X}> <{:04X}>\n", cid, unicode));
// force 2 byte representation for unicode values <= 0xFFFF, 4 byte otherwise
if unicode <= 0xFFFF {
if single_byte_cids {
result.push_str(&format!("<{:02X}> <{:04X}>\n", cid, unicode));
} else {
result.push_str(&format!("<{:04X}> <{:04X}>\n", cid, unicode));
}
} else {
if single_byte_cids {
result.push_str(&format!("<{:02X}> <{:08X}>\n", cid, unicode));
} else {
result.push_str(&format!("<{:04X}> <{:08X}>\n", cid, unicode));
}
}
}
result.push_str("endbfchar\n");
}
Expand Down Expand Up @@ -158,7 +171,27 @@ fn parse_hex_token(token: &str) -> Result<u32, String> {

/// Implement the CMap trait on our ToUnicodeCMap.
impl CMap for ToUnicodeCMap {
/// Decodes a byte string in which every single byte is one CID.
///
/// Each byte is looked up in `self.mappings`; the code points found are appended
/// to the result in order. Bytes without a mapping, and code points that are not
/// valid Unicode scalar values, contribute nothing to the output.
fn map_bytes(&self, bytes: &[u8]) -> String {
    // One input byte == one CID, so the bytes can be walked directly.
    bytes
        .iter()
        .filter_map(|&byte| self.mappings.get(&u32::from(byte)))
        .flatten()
        .filter_map(|&scalar| std::char::from_u32(scalar))
        .collect()
}

/// map double byte characters to their unicode representation
fn map_bytes_u16be(&self, bytes: &[u8]) -> String {
// For simplicity, assume that the byte sequence represents CIDs in big-endian,
// and that each CID is 2 bytes long.
let mut result = String::new();
Expand Down
531 changes: 505 additions & 26 deletions src/deserialize.rs

Large diffs are not rendered by default.

139 changes: 138 additions & 1 deletion src/font.rs
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,8 @@ impl BuiltinFont {
pub enum FontType {
/// OpenType font with CFF outlines; carries a byte buffer
/// (presumably the raw CFF data — TODO confirm against the parser).
OpenTypeCFF(Vec<u8>),
/// OpenType font with CFF2 outlines; carries no payload.
OpenTypeCFF2,
/// Set by `ParsedSubsetFont::ttf_from_bytes`; holds a copy of the embedded font bytes.
ParsedEmbeddedType0(Vec<u8>),
/// Set by `ParsedSubsetFont::cff_from_bytes`; holds a copy of the embedded CFF font bytes.
ParsedEmbeddedType1C(Vec<u8>),
/// TrueType font; this is the variant produced by `Default`.
#[default]
TrueType,
}
Expand Down Expand Up @@ -438,6 +440,68 @@ impl ParsedFont {
}
}

/// Custom encoding information attached to a subset font
/// (see `ParsedSubsetFontProperties::custom_encoding`).
///
/// Both fields default to `None`.
/// NOTE(review): the field names suggest the `/BaseEncoding` and `/Differences`
/// entries of a PDF encoding dictionary — confirm against the deserializer.
#[derive(Debug, Clone, Default)]
pub struct ParsedSubsetFontCustomEncoding {
    /// Name of the encoding the differences are applied to, if any.
    pub base_encoding: Option<String>,
    /// Raw PDF objects describing the differences, kept unparsed.
    pub differences: Option<Vec<lopdf::Object>>,
}

/// Character-collection identification of a descendant font
/// (see `ParsedSubsetFontDescendantFont::cid_system_info`).
///
/// All fields default to `None`.
/// NOTE(review): presumably mirrors the `/Registry`, `/Ordering` and `/Supplement`
/// entries of a PDF `/CIDSystemInfo` dictionary — confirm against the deserializer.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ParsedSubsetFontCIDSystemInfo {
    /// Ordering string of the character collection, if present.
    pub ordering: Option<String>,
    /// Registry string of the character collection, if present.
    pub registry: Option<String>,
    /// Supplement number of the character collection, if present.
    pub supplement: Option<i64>,
}

/// Optional properties of one entry of `ParsedSubsetFontProperties::descendant_fonts`.
///
/// Every field is `None` in the value produced by the derived `Default`.
/// NOTE(review): the field names suggest the entries of a PDF CIDFont dictionary
/// (`/BaseFont`, `/Subtype`, `/DW`, `/CIDToGIDMap`, `/CIDSystemInfo`) — confirm
/// against the deserializer.
#[derive(Clone, Default)]
pub struct ParsedSubsetFontDescendantFont {
/// Base font name of the descendant font, if present.
pub base_font: Option<String>,
/// Subtype name of the descendant font, if present.
pub subtype: Option<String>,
/// Presumably the default glyph width (`/DW`) — TODO confirm.
pub dw: Option<i64>,
/// CID-to-GID map stored as a string (presumably a name such as `Identity` — TODO confirm).
pub cid_to_gid_map: Option<String>,
/// Character-collection information of the descendant font, if present.
pub cid_system_info: Option<ParsedSubsetFontCIDSystemInfo>,
}

/// Optional font-level properties stored in `ParsedSubsetFont::font_properties`.
///
/// Every field is `None` in the value produced by the derived `Default`, which is
/// what the `ParsedSubsetFont` constructors install.
/// NOTE(review): the field names suggest the entries of a PDF font dictionary
/// (`/Encoding`, `/FirstChar`, `/LastChar`, `/Widths`, `/BaseFont`,
/// `/DescendantFonts`) — confirm against the deserializer.
#[derive(Clone, Default)]
pub struct ParsedSubsetFontProperties {
/// Encoding given as a plain string, if any.
pub encoding: Option<String>,
/// Encoding given as a base encoding plus differences, if any.
pub custom_encoding: Option<ParsedSubsetFontCustomEncoding>,
/// Presumably the first character code covered by `widths` — TODO confirm.
pub first_char: Option<i64>,
/// Presumably the last character code covered by `widths` — TODO confirm.
pub last_char: Option<i64>,
/// Raw PDF objects of the widths array, kept unparsed.
pub widths: Option<Vec<lopdf::Object>>,
/// Base font name, if present.
pub base_font: Option<String>,
/// Descendant font descriptions, if present.
pub descendant_fonts: Option<Vec<ParsedSubsetFontDescendantFont>>,
}

/// Optional font-descriptor properties stored in
/// `ParsedSubsetFont::font_descriptor_properties`.
///
/// Every field is `None` in the value produced by the derived `Default`.
/// NOTE(review): the field names suggest the entries of a PDF `/FontDescriptor`
/// dictionary — confirm against the deserializer. Numeric entries are stored as
/// `i64`; whether fractional values can occur in the input is not visible here.
#[derive(Debug, Clone, Default)]
pub struct ParsedSubsetFontDescriptorProperties {
    /// Character set description, if present.
    pub charset: Option<String>,
    /// Font family name, if present.
    pub font_family: Option<String>,
    /// Font stretch value, if present.
    pub font_stretch: Option<String>,
    /// Ascent metric, if present.
    pub ascent: Option<i64>,
    /// Descent metric, if present.
    pub descent: Option<i64>,
    /// Cap-height metric, if present.
    pub cap_height: Option<i64>,
    /// Font flags, if present.
    pub flags: Option<i64>,
    /// Italic angle, if present.
    pub italic_angle: Option<i64>,
    /// Font weight, if present.
    pub font_weight: Option<i64>,
    /// Vertical stem width, if present.
    pub stemv: Option<i64>,
    /// x-height metric, if present.
    pub xheight: Option<i64>,
    /// Raw PDF objects of the font bounding box, kept unparsed.
    pub font_bbox: Option<Vec<lopdf::Object>>,
    /// Raw bytes of the CID set, if present.
    pub cid_set: Option<Vec<u8>>,
}

/// A font that was found embedded in a PDF as a subset.
///
/// In contrast to `ParsedFont`, this font was embedded as a subset font and therefore
/// cannot be used to shape new text elements. When serializing the PDF this font is
/// re-embedded as is, keeping the parsed text elements intact.
#[derive(Clone, Default)]
pub struct ParsedSubsetFont {
/// Font bytes exactly as passed to `ttf_from_bytes` / `cff_from_bytes`.
pub original_bytes: Vec<u8>,
/// Kind of embedded font; the constructors set `FontType::ParsedEmbeddedType0`
/// (TrueType path) or `FontType::ParsedEmbeddedType1C` (CFF path).
pub font_type: FontType,
/// First entry of the CFF name index, lossily decoded as UTF-8, when built by
/// `cff_from_bytes`; `None` when built by `ttf_from_bytes`.
pub font_name: Option<String>,
/// CID-to-Unicode mapping; both constructors leave this as `None`.
pub cmap: Option<ToUnicodeCMap>,
/// Font-level properties; both constructors install the all-`None` default.
pub font_properties: ParsedSubsetFontProperties,
/// Font-descriptor properties; both constructors install the all-`None` default.
pub font_descriptor_properties: ParsedSubsetFontDescriptorProperties,
}

pub trait PrepFont {
fn lgi(&self, codepoint: u32) -> Option<u32>;

Expand Down Expand Up @@ -510,6 +574,19 @@ impl fmt::Debug for ParsedFont {
}
}

/// Two subset fonts are equal when their embedded font bytes are identical.
///
/// Only `original_bytes` takes part in the comparison; name, CMap and the
/// property structs are ignored.
impl PartialEq for ParsedSubsetFont {
    fn eq(&self, other: &Self) -> bool {
        // Compare the content, not just the length: two different fonts can easily
        // have the same size. Slice equality still rejects differing lengths first.
        self.original_bytes == other.original_bytes
    }
}

/// Compact debug representation that identifies the font without dumping its bytes.
impl fmt::Debug for ParsedSubsetFont {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        f.debug_struct("ParsedSubsetFont")
            .field("font_name", &self.font_name)
            // The font bytes can be large, so only their length is printed.
            .field("original_bytes_len", &self.original_bytes.len())
            .field("has_cmap", &self.cmap.is_some())
            // Remaining fields are intentionally omitted; `..` marks that in the output.
            .finish_non_exhaustive()
    }
}

#[derive(Debug, Clone)]
pub struct SubsetFont {
pub bytes: Vec<u8>,
Expand Down Expand Up @@ -692,7 +769,7 @@ impl ParsedFont {

// Create the CMap and generate its string representation
let cmap = ToUnicodeCMap { mappings };
cmap.to_cmap_string(&font_id.0)
cmap.to_cmap_string(&font_id.0, false)
}

pub(crate) fn generate_gid_to_cid_map(&self, glyph_ids: &[(u16, char)]) -> Vec<(u16, u16)> {
Expand Down Expand Up @@ -1614,6 +1691,66 @@ impl ParsedFont {
}
}

impl ParsedSubsetFont {
    /// Wraps embedded TrueType font bytes without parsing them.
    ///
    /// The bytes are copied into both `original_bytes` and
    /// `FontType::ParsedEmbeddedType0`; name and CMap are left as `None` and the
    /// property structs are defaulted. `_font_index` is unused.
    ///
    /// Appends an info message to `warnings` and always returns `Some`.
    pub fn ttf_from_bytes(
        font_bytes: &[u8],
        _font_index: usize,
        warnings: &mut Vec<PdfWarnMsg>,
    ) -> Option<Self> {
        warnings.push(PdfWarnMsg::info(
            0,
            0,
            "Successfully read embedded TrueType font data".to_string(),
        ));

        Some(Self {
            original_bytes: font_bytes.to_vec(),
            font_type: FontType::ParsedEmbeddedType0(font_bytes.to_vec()),
            font_name: None,
            cmap: None,
            font_properties: ParsedSubsetFontProperties::default(),
            font_descriptor_properties: ParsedSubsetFontDescriptorProperties::default(),
        })
    }

    /// Reads embedded CFF font bytes and wraps them.
    ///
    /// The bytes must parse as a CFF font. On success an info message is appended
    /// to `warnings`, the first entry of the CFF name index (lossily decoded as
    /// UTF-8) becomes `font_name`, and the bytes are copied into both
    /// `original_bytes` and `FontType::ParsedEmbeddedType1C`. `_font_index` is unused.
    ///
    /// Returns `None`, after appending a warning, when the bytes cannot be read as CFF.
    pub fn cff_from_bytes(
        font_bytes: &[u8],
        _font_index: usize,
        warnings: &mut Vec<PdfWarnMsg>,
    ) -> Option<Self> {
        let scope = allsorts_subset_browser::binary::read::ReadScope::new(font_bytes);
        let cff = match scope.read::<allsorts_subset_browser::cff::CFF<'_>>() {
            Ok(cff) => cff,
            Err(e) => {
                warnings.push(PdfWarnMsg::warning(
                    0,
                    0,
                    format!("Failed to read embedded CFF font data: {}", e),
                ));
                return None;
            }
        };
        warnings.push(PdfWarnMsg::info(
            0,
            0,
            "Successfully read embedded CFF font data".to_string(),
        ));

        // Only the first name-index entry is used as the font name.
        let font_name = cff
            .name_index
            .iter()
            .next()
            .map(|raw_name| String::from_utf8_lossy(raw_name).into_owned());

        Some(Self {
            original_bytes: font_bytes.to_vec(),
            font_type: FontType::ParsedEmbeddedType1C(font_bytes.to_vec()),
            font_name,
            cmap: None,
            font_properties: ParsedSubsetFontProperties::default(),
            font_descriptor_properties: ParsedSubsetFontDescriptorProperties::default(),
        })
    }
}

#[derive(Debug, Copy, Clone, PartialEq, Eq, PartialOrd, Ord, Hash)]
#[repr(C)]
pub struct FontMetrics {
Expand Down
8 changes: 8 additions & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -449,6 +449,9 @@ pub struct PdfResources {
/// Fonts found in the PDF file, indexed by the sha256 of their contents
#[serde(default)]
pub fonts: PdfFontMap,
/// Fonts found embedded as subset fonts in the PDF file, indexed by the sha256 of their contents
#[serde(skip)]
pub subsetfonts: PdfSubsetFontMap,
/// XObjects (forms, images, embedded PDF contents, etc.)
#[serde(default)]
pub xobjects: XObjectMap,
Expand Down Expand Up @@ -487,6 +490,11 @@ pub struct PdfFontMap {
pub map: BTreeMap<FontId, ParsedFont>,
}

#[derive(Debug, PartialEq, Default, Clone)]
pub struct PdfSubsetFontMap {
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There is already a `SubsetFont`.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, `SubsetFont` is used after the font has been parsed and loaded from the PDF through allsorts. However, because the CFF fonts fail to load (they are missing table entries), that code path was never reached for them. I left it unchanged, as it might serve a different purpose. If it does not, the two types should be combined.

pub map: BTreeMap<FontId, ParsedSubsetFont>,
}

/// Marker type for a parsed ICC profile; it currently has no fields and carries no data.
#[derive(Debug, PartialEq, Default, Clone)]
pub struct ParsedIccProfile {}

Expand Down
Loading
Loading