feat: typlite supports docx export (#1727)

* feat: docx impl

Revert "build: remove other cargo deps"

This reverts commit 059b480b1f635d1e8f5be7c191075efcf959b40b.

feat(typlite): Docx export and export markdown in cmark-writer (#1698)

* feat: docx export support

* refactor: simplify DocxConverter structure and improve content handling

* tests: add binary insta for docx

* feat: add MathBlock style and improve frame rendering in DocxConverter

* fix: enhance paragraph creation (silly method)

* fix: enhance math equation rendering

* use md5 instead of docx binary

* feat: enhance list numbering and paragraph handling in DocxConverter

* Refactor DOCX converter to improve list handling and document structure

- Introduced separate methods for creating ordered and unordered list numbering.
- Enhanced list management by tracking next numbering IDs.
- Consolidated paragraph and run management within the DocxConverter.
- Improved image processing with better error handling and placeholder support.
- Streamlined the handling of various HTML elements, including headings, lists, and images.
- Added functionality for processing captions and preformatted blocks.
- Updated methods for processing inline styles and links.

* feat: update cmark-writer to version 0.2.0

* feat: refactor code block handling in DOCX converter for improved readability

* refactor: refactor DOCX converter to enhance document structure

* refactor docx to separated files

* chore: update cmark-writer version to 0.3.0

* fix: ol custom value

* feat: table and grid processing

* use cmark-writer's ast node for consistency

* fix: update snapshot hashes for document generation tests

* fix: add preamble

* update snapshot hashes

* refactor DOCX conversion: Split writer functionality into separate module, enhance image processing, and clean up utility functions

* update comments in LaTeX and Markdown converters for clarity and consistency

* fmt

* delete utils

* feat: support figure node by custom node in cmark-writer

* fix

* fix: frame

* feat: enhance table conversion logic in MarkdownConverter

* refactor: simplify FigureNode implementation by removing CustomNode trait

* chore: update cmark-writer to version 0.5.0

* fix: update figure and raw inline snapshots for consistency

* fix: update snapshot hashes and correct caption reference in markdown.typ

* refactor proj structure

* feat: update CompileArgs to support multiple output files and remove debug option

* docs: update README to clarify usage of multiple output formats and comment out feature section

* remove DocxConverter module

* fix: update snapshots for figure caption, list, outline, and docx generation

* update tests

Co-Authored-By: Hong Jiarong <me@jrhim.com>

* test: docx snapshots

* fix: use old resvg

* feat: make docx opt-in

* fix: image process on our hand

* dev: remove support to rarely used image formats

* feat: use new base64

* test: update snapshot

* fix: dim calc

* fix: dim calc 2

* test: update snapshot

---------

Co-authored-by: Hong Jiarong <me@jrhim.com>
This commit is contained in:
Myriad-Dreamin 2025-05-01 17:23:46 +08:00 committed by GitHub
parent 3ba3211d1a
commit 6b7ca47f23
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
48 changed files with 1450 additions and 6 deletions

34
Cargo.lock generated
View file

@ -1034,6 +1034,19 @@ dependencies = [
"syn 2.0.100",
]
[[package]]
name = "docx-rs"
version = "0.4.18-rc19"
source = "git+https://github.com/Myriad-Dreamin/docx-rs?rev=db49a729f68dbdb9e8e91857fbb1c3d414209871#db49a729f68dbdb9e8e91857fbb1c3d414209871"
dependencies = [
"base64",
"serde",
"serde_json",
"thiserror 1.0.69",
"xml-rs",
"zip",
]
[[package]]
name = "downcast-rs"
version = "1.2.1"
@ -4765,9 +4778,12 @@ dependencies = [
"clap",
"cmark-writer",
"comemo",
"docx-rs",
"ecow",
"image",
"insta",
"regex",
"resvg",
"tinymist-analysis",
"tinymist-derive",
"tinymist-project",
@ -5907,6 +5923,12 @@ dependencies = [
"rustix",
]
[[package]]
name = "xml-rs"
version = "0.8.26"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a62ce76d9b56901b19a74f19431b0d8b3bc7ca4ad685a746dfd78ca8f4fc6bda"
[[package]]
name = "xmlparser"
version = "0.13.6"
@ -6071,6 +6093,18 @@ dependencies = [
"syn 2.0.100",
]
[[package]]
name = "zip"
version = "0.6.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "760394e246e4c28189f19d488c058bf16f564016aefac5d32bb1f3b51d5e9261"
dependencies = [
"byteorder",
"crc32fast",
"crossbeam-utils",
"flate2",
]
[[package]]
name = "zune-core"
version = "0.4.12"

View file

@ -33,7 +33,16 @@ typst.workspace = true
typst-svg.workspace = true
typst-syntax.workspace = true
typst-html.workspace = true
regex.workspace = true
cmark-writer = { version = "0.6.3", features = ["gfm"] }
docx-rs = { git = "https://github.com/Myriad-Dreamin/docx-rs", default-features = false, rev = "db49a729f68dbdb9e8e91857fbb1c3d414209871", optional = true }
# typst can only support these formats.
image = { version = "0.25.6", default-features = false, features = [
"png",
"jpeg",
"gif",
], optional = true }
resvg = { version = "0.43.0", optional = true }
[dev-dependencies]
insta.workspace = true
@ -41,10 +50,11 @@ regex.workspace = true
tinymist-tests.workspace = true
[features]
default = ["cli", "embed-fonts", "no-content-hint"]
default = ["cli", "embed-fonts", "no-content-hint", "docx"]
clap = ["dep:clap"]
cli = ["clap", "clap/wrap_help"]
no-content-hint = ["tinymist-project/no-content-hint"]
docx = ["docx-rs", "image", "resvg"]
# Embeds Typst's default fonts for
# - text (Linux Libertine),

View file

@ -19,6 +19,7 @@ pub enum ListState {
pub enum Format {
Md,
LaTeX,
#[cfg(feature = "docx")]
Docx,
}

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/base.typ
---
siphash128_13:f242a739ddf7cdce8041455cd09bf221

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/enum.typ
---
siphash128_13:120c2e9245d767d648fd52a8564c9efc

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/enum2.typ
---
siphash128_13:120c2e9245d767d648fd52a8564c9efc

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/figure_caption.typ
---
siphash128_13:17d544d88231b74b1119c35f627026b

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/figure_image.typ
---
siphash128_13:89ee713812f00bde9ac174f72c81760

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/figure_image_alt.typ
---
siphash128_13:89ee713812f00bde9ac174f72c81760

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/image.typ
---
siphash128_13:89ee713812f00bde9ac174f72c81760

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/image_alt.typ
---
siphash128_13:89ee713812f00bde9ac174f72c81760

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/link.typ
---
siphash128_13:35e614ded7c81c7fb6781d77872add56

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/link2.typ
---
siphash128_13:2374bfc8248e276ed1549f5d6a8b4a40

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/link3.typ
---
siphash128_13:5d5f436195b9b0b0f206881bc4d810f8

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/list.typ
---
siphash128_13:dd68d2d40ddf137ad77719e71c56a19e

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/math_block.typ
---
siphash128_13:ca4f0e6c5b2afee90d9736cb2d3bd6ba

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/math_block2.typ
---
siphash128_13:1c9f3489f7742ef572998ff2b4fd5abd

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/math_inline.typ
---
siphash128_13:2ac3d241b41c4ee23a122b73e43c8063

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/outline.typ
---
siphash128_13:549cf83e9b77d8ae061c95ceb4f93ef6

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/raw_inline.typ
---
siphash128_13:fe468826fde99ac8a0e77767d4045199

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/table.typ
---
siphash128_13:ce1b6f668016a12edf304ab7f38aea42

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/base.typ
---
siphash128_13:f242a739ddf7cdce8041455cd09bf221

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/enum.typ
---
siphash128_13:120c2e9245d767d648fd52a8564c9efc

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/enum2.typ
---
siphash128_13:120c2e9245d767d648fd52a8564c9efc

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/figure_caption.typ
---
siphash128_13:17d544d88231b74b1119c35f627026b

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/figure_image.typ
---
siphash128_13:89ee713812f00bde9ac174f72c81760

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/figure_image_alt.typ
---
siphash128_13:89ee713812f00bde9ac174f72c81760

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/image.typ
---
siphash128_13:89ee713812f00bde9ac174f72c81760

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/image_alt.typ
---
siphash128_13:89ee713812f00bde9ac174f72c81760

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/link.typ
---
siphash128_13:35e614ded7c81c7fb6781d77872add56

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/link2.typ
---
siphash128_13:2374bfc8248e276ed1549f5d6a8b4a40

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/link3.typ
---
siphash128_13:5d5f436195b9b0b0f206881bc4d810f8

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/list.typ
---
siphash128_13:dd68d2d40ddf137ad77719e71c56a19e

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/math_block.typ
---
siphash128_13:ca4f0e6c5b2afee90d9736cb2d3bd6ba

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/math_block2.typ
---
siphash128_13:1c9f3489f7742ef572998ff2b4fd5abd

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/math_inline.typ
---
siphash128_13:2ac3d241b41c4ee23a122b73e43c8063

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/outline.typ
---
siphash128_13:549cf83e9b77d8ae061c95ceb4f93ef6

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/raw_inline.typ
---
siphash128_13:fe468826fde99ac8a0e77767d4045199

View file

@ -0,0 +1,6 @@
---
source: crates/typlite/src/tests.rs
expression: hash
input_file: crates/typlite/src/fixtures/integration/table.typ
---
siphash128_13:ce1b6f668016a12edf304ab7f38aea42

View file

@ -97,6 +97,15 @@ impl MarkdownDocument {
Ok(output)
}
/// Convert the content to a DOCX document.
///
/// Parses the markdown document into its AST and feeds it to the
/// DOCX writer produced by [`WriterFactory`].
#[cfg(feature = "docx")]
pub fn to_docx(&self) -> Result<Vec<u8>> {
    let parsed = self.parse()?;
    let mut docx_writer = WriterFactory::create(Format::Docx);
    docx_writer.write_vec(&parsed)
}
}
/// A color theme for rendering the content. The valid values can be checked in [color-scheme](https://developer.mozilla.org/en-US/docs/Web/CSS/color-scheme).
@ -171,10 +180,20 @@ impl Typlite {
match self.format {
Format::Md => self.convert_doc()?.to_md_string(),
Format::LaTeX => self.convert_doc()?.to_tex_string(true),
_ => Err("format is not supported".into()),
#[cfg(feature = "docx")]
Format::Docx => Err("docx format is not supported".into()),
}
}
/// Convert the content to a DOCX document.
///
/// Errors when this instance was not configured with [`Format::Docx`].
#[cfg(feature = "docx")]
pub fn to_docx(self) -> Result<Vec<u8>> {
    match self.format {
        Format::Docx => self.convert_doc()?.to_docx(),
        _ => Err("format is not DOCX".into()),
    }
}
/// Convert the content to a markdown document.
pub fn convert_doc(self) -> Result<MarkdownDocument> {
let entry = self.world.entry_state();

View file

@ -85,6 +85,7 @@ fn main() -> typlite::Result<()> {
Some(output) if output.extension() == Some(std::ffi::OsStr::new("tex")) => {
Format::LaTeX
}
#[cfg(feature = "docx")]
Some(output) if output.extension() == Some(std::ffi::OsStr::new("docx")) => {
Format::Docx
}
@ -92,7 +93,30 @@ fn main() -> typlite::Result<()> {
};
match format {
Format::Docx => todo!(),
#[cfg(feature = "docx")]
Format::Docx => {
let docx_data = match doc.to_docx() {
Ok(data) => data,
Err(err) => {
eprintln!("Error generating DOCX for {}: {}", output_path, err);
continue;
}
};
match output {
None => {
eprintln!("output file is required for DOCX format");
continue;
}
Some(output) => {
if let Err(err) = std::fs::write(&output, docx_data) {
eprintln!("failed to write DOCX file {}: {}", output.display(), err);
continue;
}
println!("Generated DOCX file: {}", output.display());
}
}
}
Format::LaTeX => {
let result = doc.to_tex_string(true);
match (result, output) {

View file

@ -33,6 +33,42 @@ fn convert_docs() {
});
}
#[test]
#[cfg(feature = "docx")]
fn test_docx_generation() {
    snapshot_testing("integration", &|world, _path| {
        // Build a converter targeting the DOCX output format.
        let converter = Typlite::new(Arc::new(world.clone()))
            .with_feature(TypliteFeat::default())
            .with_format(Format::Docx);

        let docx_data = converter
            .to_docx()
            .unwrap_or_else(|err| panic!("Failed to generate DOCX: {}", err));

        // A DOCX file is a ZIP archive, so it must be non-empty and begin
        // with the "PK" magic bytes.
        assert!(!docx_data.is_empty(), "DOCX data should not be empty");
        assert_eq!(
            &docx_data[0..2],
            &[0x50, 0x4B],
            "DOCX data should start with PK signature"
        );

        // Snapshot a hash of the bytes instead of the raw binary to keep
        // the repository small while still pinning the output.
        let hash = format!(
            "siphash128_13:{:016x}",
            tinymist_std::hash::hash128(&docx_data)
        );
        insta::assert_snapshot!("docx_generation_hash", hash);
    });
}
enum ConvKind {
Md { for_docs: bool },
LaTeX,

View file

@ -0,0 +1,210 @@
//! Image processing functionality for DOCX conversion
use base64::Engine;
use docx_rs::*;
use std::io::Cursor;
use crate::Result;
/// Image processor for DOCX documents
pub struct DocxImageProcessor;

impl Default for DocxImageProcessor {
    fn default() -> Self {
        Self::new()
    }
}

impl DocxImageProcessor {
    /// Create a new image processor
    pub fn new() -> Self {
        Self
    }

    /// Convert SVG data to PNG format.
    ///
    /// The SVG is rasterized at 300 DPI (scaled up from the 96 DPI CSS
    /// reference resolution) so embedded images stay crisp.
    ///
    /// # Errors
    /// Returns an error when the input is not UTF-8, not parseable SVG,
    /// the pixel buffer cannot be allocated, or PNG encoding fails.
    pub fn convert_svg_to_png(&self, svg_data: &[u8]) -> Result<Vec<u8>> {
        // Check if data is valid SVG (resvg parses from &str).
        let svg_str = match std::str::from_utf8(svg_data) {
            Ok(s) => s,
            Err(_) => return Err("Unable to parse input data as UTF-8 string".into()),
        };

        let dpi = 300.0;
        let scale_factor = dpi / 96.0;
        let opt = resvg::usvg::Options {
            dpi,
            ..resvg::usvg::Options::default()
        };

        // Parse SVG
        let rtree = match resvg::usvg::Tree::from_str(svg_str, &opt) {
            Ok(tree) => tree,
            Err(e) => return Err(format!("SVG parsing error: {:?}", e).into()),
        };

        // Size the raster buffer to match the requested DPI.
        let size = rtree.size().to_int_size();
        let width = (size.width() as f32 * scale_factor) as u32;
        let height = (size.height() as f32 * scale_factor) as u32;

        // Create pixel buffer
        let mut pixmap = match resvg::tiny_skia::Pixmap::new(width, height) {
            Some(pixmap) => pixmap,
            None => return Err("Unable to create pixel buffer".into()),
        };

        // Render SVG to pixel buffer
        resvg::render(
            &rtree,
            resvg::tiny_skia::Transform::from_scale(scale_factor, scale_factor),
            &mut pixmap.as_mut(),
        );

        // Encode as PNG
        pixmap
            .encode_png()
            .map_err(|e| format!("PNG encoding error: {:?}", e).into())
    }

    /// Process image data and add it to the document as a block image.
    ///
    /// The data is re-encoded to PNG regardless of source format; on any
    /// decode/encode failure a bracketed placeholder paragraph is added
    /// instead, so conversion never aborts on a bad image. When `alt_text`
    /// is present and non-empty, it is appended below the image using the
    /// "Caption" paragraph style.
    pub fn process_image_data(
        &self,
        docx: Docx,
        data: &[u8],
        alt_text: Option<&str>,
        scale: Option<f32>,
    ) -> Docx {
        // Add image format validation before attempting a full decode.
        match image::guess_format(data) {
            Ok(..) => {
                // Decode, then re-encode as PNG (the one raster format we embed).
                let pic = match image::load_from_memory(data) {
                    Ok(img) => {
                        let (w, h) =
                            Self::image_dim(::image::GenericImageView::dimensions(&img), scale);
                        let mut buffer = Vec::new();
                        if img
                            .write_to(&mut Cursor::new(&mut buffer), image::ImageFormat::Png)
                            .is_ok()
                        {
                            Pic::new_with_dimensions(buffer, w, h)
                        } else {
                            // If conversion fails, return original document (without image)
                            let err_para = Paragraph::new().add_run(Run::new().add_text(
                                "[Image processing error: Unable to convert to supported format]".to_string(),
                            ));
                            return docx.add_paragraph(err_para);
                        }
                    }
                    Err(_) => {
                        // If unable to load image, return original document (without image)
                        let err_para = Paragraph::new().add_run(Run::new().add_text(
                            "[Image processing error: Unable to load image]".to_string(),
                        ));
                        return docx.add_paragraph(err_para);
                    }
                };

                let img_para = Paragraph::new().add_run(Run::new().add_image(pic));
                let doc_with_img = docx.add_paragraph(img_para);

                // Append a caption paragraph only for non-empty alt text.
                match alt_text {
                    Some(alt) if !alt.is_empty() => {
                        let caption_para = Paragraph::new()
                            .style("Caption")
                            .add_run(Run::new().add_text(alt));
                        doc_with_img.add_paragraph(caption_para)
                    }
                    _ => doc_with_img,
                }
            }
            Err(_) => {
                // If unable to determine image format, return original document (without image)
                let err_para = Paragraph::new().add_run(Run::new().add_text(
                    "[Image processing error: Unknown image format]".to_string(),
                ));
                docx.add_paragraph(err_para)
            }
        }
    }

    /// Process an inline image and add it to a `Run`.
    ///
    /// Mirrors [`Self::process_image_data`], but embeds into an existing
    /// run; failures degrade to bracketed placeholder text.
    pub fn process_inline_image(&self, mut run: Run, data: &[u8]) -> Result<Run> {
        match image::guess_format(data) {
            Ok(..) => {
                // Try to convert to PNG
                let pic = match image::load_from_memory(data) {
                    Ok(img) => {
                        // Inline images keep their native pixel dimensions
                        // (no max-width clamping, unlike block images).
                        let (w, h) = ::image::GenericImageView::dimensions(&img);
                        let mut buffer = Vec::new();
                        if img
                            .write_to(&mut Cursor::new(&mut buffer), image::ImageFormat::Png)
                            .is_ok()
                        {
                            Pic::new_with_dimensions(buffer, w, h)
                        } else {
                            run = run.add_text("[Image conversion error]");
                            return Ok(run);
                        }
                    }
                    Err(_) => {
                        run = run.add_text("[Image loading error]");
                        return Ok(run);
                    }
                };
                run = run.add_image(pic);
                Ok(run)
            }
            Err(_) => {
                run = run.add_text("[Unknown image format]");
                Ok(run)
            }
        }
    }

    /// Process a data-URL inline image (`data:image/...;base64,...`).
    ///
    /// `is_typst_block` marks payloads that are SVG emitted by typst and
    /// must be rasterized via [`Self::convert_svg_to_png`] before embedding.
    pub fn process_data_url_image(&self, run: Run, src: &str, is_typst_block: bool) -> Result<Run> {
        if let Some(data_start) = src.find("base64,") {
            let base64_data = &src[data_start + 7..];
            if let Ok(img_data) = base64::engine::general_purpose::STANDARD.decode(base64_data) {
                if is_typst_block {
                    // Use resvg to convert SVG to PNG first.
                    return match self.convert_svg_to_png(&img_data) {
                        Ok(png_data) => self.process_inline_image(run, &png_data),
                        Err(_) => Ok(run.add_text("[SVG conversion failed]")),
                    };
                }
                // Normal image processing
                return self.process_inline_image(run, &img_data);
            }
        }
        Ok(run.add_text("[Invalid data URL]"))
    }

    /// Calculate image dimensions for DOCX.
    ///
    /// Applies the optional scale factor, then clamps the width to
    /// `MAX_WIDTH` while preserving the aspect ratio. 5,486,400 equals
    /// 6 inches in EMU (914,400 EMU/inch) — NOTE(review): the inputs here
    /// are pixel dimensions, so confirm the unit that
    /// `Pic::new_with_dimensions` actually expects.
    pub fn image_dim((w, h): (u32, u32), scale_factor: Option<f32>) -> (u32, u32) {
        const MAX_WIDTH: u32 = 5_486_400;
        let actual_scale = scale_factor.unwrap_or(1.0);
        let scaled_w = (w as f32 * actual_scale) as u32;
        let scaled_h = (h as f32 * actual_scale) as u32;
        if scaled_w > MAX_WIDTH {
            // Shrink proportionally so the image fits the page width.
            let ratio = scaled_h as f32 / scaled_w as f32;
            (MAX_WIDTH, (MAX_WIDTH as f32 * ratio) as u32)
        } else {
            (scaled_w, scaled_h)
        }
    }
}

View file

@ -0,0 +1,14 @@
//! DOCX converter implementation using docx-rs
//!
//! This module is organized into several main components:
//! - `writer`: renders the cmark AST directly to DOCX format
//!   (no intermediate node representation)
//! - `styles`: document style management
//! - `numbering`: list numbering management
//! - `image_processor`: image decoding/conversion for embedding

mod image_processor;
mod numbering;
mod styles;
mod writer;

pub use writer::DocxWriter;

View file

@ -0,0 +1,126 @@
//! List numbering management for DOCX conversion
use docx_rs::*;
/// List numbering management for DOCX
#[derive(Clone, Debug)]
pub struct DocxNumbering {
    /// Whether `initialize_numbering` has already run.
    initialized: bool,
    /// Next id handed out for both the abstract numbering and its instance.
    next_id: usize,
}

impl Default for DocxNumbering {
    fn default() -> Self {
        Self::new()
    }
}

impl DocxNumbering {
    /// Create a new numbering manager
    pub fn new() -> Self {
        Self {
            initialized: false,
            next_id: 1,
        }
    }

    /// Create a list level with the specified parameters.
    ///
    /// `id` is the zero-based nesting depth; indentation grows by 720
    /// units per level, with a slightly smaller hanging indent for bullet
    /// lists than for numbered lists.
    pub fn create_list_level(id: usize, format: &str, text: &str, is_bullet: bool) -> Level {
        let indent_size = 720 * (id + 1) as i32;
        let hanging_indent = if is_bullet { 360 } else { 420 };
        Level::new(
            id,
            Start::new(1),
            NumberFormat::new(format),
            LevelText::new(text),
            LevelJc::new("left"),
        )
        .indent(
            Some(indent_size),
            Some(SpecialIndentType::Hanging(hanging_indent)),
            None,
            None,
        )
    }

    /// Initialize the numbering manager (idempotent; currently only flips
    /// the `initialized` flag and returns the document unchanged).
    pub fn initialize_numbering(&mut self, docx: Docx) -> Docx {
        if self.initialized {
            return docx;
        }
        self.initialized = true;
        docx
    }

    /// Create a new ordered list numbering, including a new AbstractNumbering instance.
    ///
    /// Returns the updated document together with the numbering id that list
    /// paragraphs should reference.
    pub fn create_ordered_numbering(&mut self, docx: Docx) -> (Docx, usize) {
        let abstract_id = self.next_id;
        let numbering_id = self.next_id;
        self.next_id += 1;

        let mut ordered_abstract = AbstractNumbering::new(abstract_id);
        for i in 0..9 {
            // "%N." displays the counter of level N (1-based). Fix: the old
            // match fell back to "%7." for levels 8 and 9, which would show
            // the level-7 counter at deeper nesting.
            let level_text = format!("%{}.", i + 1);
            let number_format = match i {
                0 => "decimal",
                1 => "lowerLetter",
                2 => "lowerRoman",
                3 => "upperRoman",
                4 => "decimal",
                5 => "lowerLetter",
                _ => "decimal",
            };
            let mut ordered_level = Self::create_list_level(i, number_format, &level_text, false);
            if i > 0 {
                // Restart this level's counter whenever a higher level advances.
                ordered_level = ordered_level.level_restart(0_u32);
            }
            ordered_abstract = ordered_abstract.add_level(ordered_level);
        }

        let docx = docx
            .add_abstract_numbering(ordered_abstract)
            .add_numbering(Numbering::new(numbering_id, abstract_id));
        (docx, numbering_id)
    }

    /// Create a new unordered list numbering, including a new AbstractNumbering instance
    pub fn create_unordered_numbering(&mut self, docx: Docx) -> (Docx, usize) {
        let abstract_id = self.next_id;
        let numbering_id = self.next_id;
        self.next_id += 1;

        // Create AbstractNumbering for unordered list
        let mut unordered_abstract = AbstractNumbering::new(abstract_id);
        // Add 9 levels of definition.
        // NOTE(review): every bullet glyph below is an empty string — this
        // looks like lost Unicode bullet characters (e.g. •/◦/▪); confirm
        // against the original source before relying on the rendered bullets.
        for i in 0..9 {
            let bullet_text = match i {
                0 => "",
                1 => "",
                2 => "",
                3 => "",
                4 => "",
                _ => "",
            };
            let unordered_level = Self::create_list_level(i, "bullet", bullet_text, true);
            unordered_abstract = unordered_abstract.add_level(unordered_level);
        }

        let docx = docx
            .add_abstract_numbering(unordered_abstract)
            .add_numbering(Numbering::new(numbering_id, abstract_id));
        (docx, numbering_id)
    }
}

View file

@ -0,0 +1,107 @@
//! Document style management for DOCX conversion
use docx_rs::*;
/// Document style management
#[derive(Clone, Debug)]
pub struct DocxStyles {
    // NOTE(review): this flag is never set to `true` anywhere in this module
    // (`initialize_styles` takes `&self`), so the early-return guard below is
    // currently dead code — confirm whether callers rely on repeated calls.
    initialized: bool,
}

impl Default for DocxStyles {
    fn default() -> Self {
        Self::new()
    }
}

impl DocxStyles {
    /// Create a new style manager
    pub fn new() -> Self {
        Self { initialized: false }
    }

    /// Create a heading style with the specified parameters
    fn create_heading_style(name: &str, display_name: &str, size: usize) -> Style {
        Style::new(name, StyleType::Paragraph)
            .name(display_name)
            .size(size)
            .bold()
    }

    /// Initialize all document styles: six heading levels, monospace code
    /// (block and inline), centered math blocks, inline emphasis/strong/
    /// highlight/hyperlink character styles, and blockquote/caption/table
    /// paragraph styles.
    pub fn initialize_styles(&self, docx: Docx) -> Docx {
        if self.initialized {
            return docx;
        }

        let heading1 = Self::create_heading_style("Heading1", "Heading 1", 32);
        let heading2 = Self::create_heading_style("Heading2", "Heading 2", 28);
        let heading3 = Self::create_heading_style("Heading3", "Heading 3", 26);
        let heading4 = Self::create_heading_style("Heading4", "Heading 4", 24);
        let heading5 = Self::create_heading_style("Heading5", "Heading 5", 22);
        let heading6 = Self::create_heading_style("Heading6", "Heading 6", 20);

        // Monospace font set shared by both code styles.
        let courier_fonts = RunFonts::new()
            .ascii("Courier New")
            .hi_ansi("Courier New")
            .east_asia("Courier New")
            .cs("Courier New");
        let code_block = Style::new("CodeBlock", StyleType::Paragraph)
            .name("Code Block")
            .fonts(courier_fonts.clone())
            .size(18);
        let code_inline = Style::new("CodeInline", StyleType::Character)
            .name("Code Inline")
            .fonts(courier_fonts)
            .size(18);

        let math_block = Style::new("MathBlock", StyleType::Paragraph)
            .name("Math Block")
            .align(AlignmentType::Center);
        let emphasis = Style::new("Emphasis", StyleType::Character)
            .name("Emphasis")
            .italic();
        let strong = Style::new("Strong", StyleType::Character)
            .name("Strong")
            .bold();
        let highlight = Style::new("Highlight", StyleType::Character)
            .name("Highlight")
            .highlight("yellow");
        let hyperlink = Style::new("Hyperlink", StyleType::Character)
            .name("Hyperlink")
            .color("0000FF")
            .underline("single");
        let blockquote = Style::new("Blockquote", StyleType::Paragraph)
            .name("Block Quote")
            .indent(Some(720), None, None, None)
            .italic();
        let caption = Style::new("Caption", StyleType::Paragraph)
            .name("Caption")
            .italic()
            .size(16)
            .align(AlignmentType::Center);
        let table = Style::new("Table", StyleType::Table)
            .name("Table")
            .table_align(TableAlignmentType::Center);

        docx.add_style(heading1)
            .add_style(heading2)
            .add_style(heading3)
            .add_style(heading4)
            .add_style(heading5)
            .add_style(heading6)
            .add_style(code_block)
            .add_style(code_inline)
            .add_style(math_block)
            .add_style(emphasis)
            .add_style(strong)
            .add_style(highlight)
            .add_style(hyperlink)
            .add_style(blockquote)
            .add_style(caption)
            .add_style(table)
    }
}

View file

@ -0,0 +1,644 @@
//! DOCX document writer implementation
use base64::Engine;
use cmark_writer::ast::{ListItem, Node};
use docx_rs::*;
use ecow::EcoString;
use std::fs;
use std::io::Cursor;
use crate::common::{FigureNode, FormatWriter};
use crate::Result;
use super::image_processor::DocxImageProcessor;
use super::numbering::DocxNumbering;
use super::styles::DocxStyles;
/// DOCX writer that generates DOCX directly from AST (without intermediate representation)
pub struct DocxWriter {
    /// Document style definitions (headings, code, captions, ...)
    styles: DocxStyles,
    /// List numbering definitions manager
    numbering: DocxNumbering,
    /// Current list nesting depth — presumably updated while writing nested lists (usage below this view); confirm
    list_level: usize,
    /// Counter of list numbering instances — presumably allocated ids (usage below this view); confirm
    list_numbering_count: usize,
    /// Helper that decodes/converts images before embedding
    image_processor: DocxImageProcessor,
}
impl Default for DocxWriter {
    /// Delegates to [`DocxWriter::new`].
    fn default() -> Self {
        Self::new()
    }
}
impl DocxWriter {
    /// Create a writer with fresh style, numbering, and list state.
    pub fn new() -> Self {
        Self {
            styles: DocxStyles::new(),
            numbering: DocxNumbering::new(),
            list_level: 0,
            list_numbering_count: 0,
            image_processor: DocxImageProcessor::new(),
        }
    }
/// Process image node
fn process_image(&self, docx: Docx, url: &str, alt_nodes: &[Node]) -> Result<Docx> {
// Build alt text
let alt_text = if !alt_nodes.is_empty() {
let mut text = String::new();
for node in alt_nodes {
if let Node::Text(content) = node {
text.push_str(content);
}
}
Some(text)
} else {
None
};
// Try reading image file
if let Ok(img_data) = fs::read(url) {
Ok(self
.image_processor
.process_image_data(docx, &img_data, alt_text.as_deref(), None))
} else {
let placeholder = format!("[Image not found: {}]", url);
let para = Paragraph::new().add_run(Run::new().add_text(placeholder));
Ok(docx.add_paragraph(para))
}
}
    /// Process figure node (image with caption).
    ///
    /// For a paragraph body, each image child is embedded and followed by a
    /// caption paragraph; any other body is delegated to `process_node` and
    /// then captioned. Missing image files degrade to a placeholder text.
    fn process_figure(&mut self, mut docx: Docx, figure_node: &FigureNode) -> Result<Docx> {
        // First handle the figure body (typically an image)
        match &*figure_node.body {
            Node::Paragraph(content) => {
                // NOTE(review): the caption is re-emitted once per non-image
                // child of this loop, so a mixed paragraph may produce
                // duplicate caption paragraphs — confirm intended.
                for node in content {
                    if let Node::Image {
                        url,
                        title: _,
                        alt: _,
                    } = node
                    {
                        // Process the image from disk.
                        if let Ok(img_data) = fs::read(url) {
                            // The caption doubles as the image alt text.
                            let alt_text = figure_node.caption.clone();
                            // Add the image with caption
                            docx = self.image_processor.process_image_data(
                                docx,
                                &img_data,
                                Some(&alt_text),
                                None,
                            );
                            // Add caption as a separate paragraph with Caption style.
                            // NOTE(review): only this branch prefixes the caption
                            // with "Figure: "; the fallback branches below use the
                            // bare caption — confirm the inconsistency is wanted.
                            if !figure_node.caption.is_empty() {
                                let caption_text = format!("Figure: {}", figure_node.caption);
                                let caption_para = Paragraph::new()
                                    .style("Caption")
                                    .add_run(Run::new().add_text(caption_text));
                                docx = docx.add_paragraph(caption_para);
                            }
                        } else {
                            // Image not found, show placeholder
                            let placeholder = format!("[Image not found: {}]", url);
                            let para = Paragraph::new().add_run(Run::new().add_text(placeholder));
                            docx = docx.add_paragraph(para);
                            // Still add caption
                            if !figure_node.caption.is_empty() {
                                let caption_para = Paragraph::new()
                                    .style("Caption")
                                    .add_run(Run::new().add_text(&figure_node.caption));
                                docx = docx.add_paragraph(caption_para);
                            }
                        }
                    } else {
                        // Handle non-image content as an ordinary inline run.
                        let mut para = Paragraph::new();
                        let run = Run::new();
                        let run = self.process_inline_to_run(run, node)?;
                        // Only emit the paragraph when it gained content.
                        if !run.children.is_empty() {
                            para = para.add_run(run);
                            docx = docx.add_paragraph(para);
                        }
                        // Add caption as a separate paragraph
                        if !figure_node.caption.is_empty() {
                            let caption_para = Paragraph::new()
                                .style("Caption")
                                .add_run(Run::new().add_text(&figure_node.caption));
                            docx = docx.add_paragraph(caption_para);
                        }
                    }
                }
            }
            // Handle other content types within figure
            _ => {
                // Process the content using standard node processing
                docx = self.process_node(docx, &figure_node.body)?;
                // Add caption as a separate paragraph
                if !figure_node.caption.is_empty() {
                    let caption_para = Paragraph::new()
                        .style("Caption")
                        .add_run(Run::new().add_text(&figure_node.caption));
                    docx = docx.add_paragraph(caption_para);
                }
            }
        }
        Ok(docx)
    }
    /// Process inline element and add to Run.
    ///
    /// Styles are applied to the single accumulating `run`, so a styled
    /// node restyles the run and then recurses into its children; inline
    /// node kinds without a match arm are silently dropped.
    fn process_inline_to_run(&self, mut run: Run, node: &Node) -> Result<Run> {
        match node {
            Node::Text(text) => {
                run = run.add_text(text);
            }
            Node::Strong(content) => {
                run = run.style("Strong");
                for child in content {
                    run = self.process_inline_to_run(run, child)?;
                }
            }
            Node::Emphasis(content) => {
                run = run.style("Emphasis");
                for child in content {
                    run = self.process_inline_to_run(run, child)?;
                }
            }
            Node::Strikethrough(content) => {
                // Strikethrough uses the run property directly, not a named style.
                run = run.strike();
                for child in content {
                    run = self.process_inline_to_run(run, child)?;
                }
            }
            Node::Link {
                url: _,
                title: _,
                content,
            } => {
                // Hyperlinks need to be processed at paragraph level, only handle content here
                run = run.style("Hyperlink");
                for child in content {
                    run = self.process_inline_to_run(run, child)?;
                }
            }
            Node::Image {
                url,
                title: _,
                alt: _,
            } => {
                // Inline image loaded from disk; missing files degrade to placeholder text.
                if let Ok(img_data) = fs::read(url) {
                    run = self.image_processor.process_inline_image(run, &img_data)?;
                } else {
                    run = run.add_text(format!("[Image not found: {}]", url));
                }
            }
            Node::HtmlElement(element) => {
                // Handle special HTML elements
                if element.tag == "mark" {
                    run = run.style("Highlight");
                    for child in &element.children {
                        run = self.process_inline_to_run(run, child)?;
                    }
                } else if element.tag == "img" && element.self_closing {
                    // alt="typst-block" marks SVG payloads produced by typst.
                    let is_typst_block = element
                        .attributes
                        .iter()
                        .any(|a| a.name == "alt" && a.value == "typst-block");
                    let src = element
                        .attributes
                        .iter()
                        .find(|a| a.name == "src")
                        .map(|a| a.value.as_str())
                        .unwrap_or("");
                    // Only data URLs are supported for inline <img>; other
                    // src values are ignored.
                    if src.starts_with("data:image/") {
                        run = self.image_processor.process_data_url_image(
                            run,
                            src,
                            is_typst_block,
                        )?;
                    }
                } else {
                    // Standard element content processing
                    for child in &element.children {
                        run = self.process_inline_to_run(run, child)?;
                    }
                }
            }
            Node::InlineCode(code) => {
                run = run.style("CodeInline").add_text(code);
            }
            Node::HardBreak => {
                run = run.add_break(BreakType::TextWrapping);
            }
            Node::SoftBreak => {
                // Soft breaks collapse to a single space, as in rendered markdown.
                run = run.add_text(" ");
            }
            // Other inline element types are intentionally ignored.
            _ => {}
        }
        Ok(run)
    }
/// Process paragraph and add to document
fn process_paragraph(
&self,
mut docx: Docx,
content: &[Node],
style: Option<&str>,
) -> Result<Docx> {
let mut para = Paragraph::new();
// Apply style
if let Some(style_name) = style {
para = para.style(style_name);
}
// Extract all link nodes
let mut links = Vec::new();
for (i, node) in content.iter().enumerate() {
if let Node::Link {
url,
title: _,
content: _,
} = node
{
links.push((i, url.clone()));
}
}
// If no links, process paragraph normally
if links.is_empty() {
// Process paragraph content
for node in content {
let run = Run::new();
let run = self.process_inline_to_run(run, node)?;
if !run.children.is_empty() {
para = para.add_run(run);
}
}
} else {
// If links exist, we need to process in segments
let mut last_idx = 0;
for (idx, url) in links {
// Process content before the link
for item in content.iter().take(idx).skip(last_idx) {
let run = Run::new();
let run = self.process_inline_to_run(run, item)?;
if !run.children.is_empty() {
para = para.add_run(run);
}
}
// Process link
if let Node::Link {
url: _,
title: _,
content: link_content,
} = &content[idx]
{
let mut hyperlink_run = Run::new().style("Hyperlink");
for child in link_content {
hyperlink_run = self.process_inline_to_run(hyperlink_run, child)?;
}
// Create and add hyperlink
if !hyperlink_run.children.is_empty() {
let hyperlink =
Hyperlink::new(&url, HyperlinkType::External).add_run(hyperlink_run);
para = para.add_hyperlink(hyperlink);
}
}
last_idx = idx + 1;
}
// Process content after the last link
for item in content.iter().skip(last_idx) {
let run = Run::new();
let run = self.process_inline_to_run(run, item)?;
if !run.children.is_empty() {
para = para.add_run(run);
}
}
}
// Only add when paragraph has content
if !para.children.is_empty() {
docx = docx.add_paragraph(para);
}
Ok(docx)
}
/// Process node and add to document
fn process_node(&mut self, mut docx: Docx, node: &Node) -> Result<Docx> {
match node {
Node::Document(blocks) => {
for block in blocks {
docx = self.process_node(docx, block)?;
}
}
Node::Paragraph(content) => {
docx = self.process_paragraph(docx, content, None)?;
}
Node::Heading {
level,
content,
heading_type: _,
} => {
// Determine heading style name
let style_name = match level {
1 => "Heading1",
2 => "Heading2",
3 => "Heading3",
4 => "Heading4",
5 => "Heading5",
_ => "Heading6",
};
docx = self.process_paragraph(docx, content, Some(style_name))?;
}
Node::BlockQuote(content) => {
for block in content {
if let Node::Paragraph(inline) = block {
docx = self.process_paragraph(docx, inline, Some("Blockquote"))?;
} else {
docx = self.process_node(docx, block)?;
}
}
}
Node::CodeBlock {
language,
content,
block_type: _,
} => {
// Add language information
if let Some(lang) = language {
if !lang.is_empty() {
let lang_para = Paragraph::new()
.style("CodeBlock")
.add_run(Run::new().add_text(lang));
docx = docx.add_paragraph(lang_para);
}
}
// Process code line by line, preserving line breaks
let lines: Vec<&str> = content.split('\n').collect();
for line in lines {
let code_para = Paragraph::new()
.style("CodeBlock")
.add_run(Run::new().add_text(line));
docx = docx.add_paragraph(code_para);
}
}
Node::OrderedList { start: _, items } => {
docx = self.process_ordered_list(docx, items)?;
}
Node::UnorderedList(items) => {
docx = self.process_unordered_list(docx, items)?;
}
Node::Table {
headers,
rows,
alignments: _,
} => {
docx = self.process_table(docx, headers, rows)?;
}
Node::Image { url, title: _, alt } => {
docx = self.process_image(docx, url, alt)?;
}
Node::Custom(custom_node) => {
if let Some(figure_node) = custom_node.as_any().downcast_ref::<FigureNode>() {
// Process figure node with special handling
docx = self.process_figure(docx, figure_node)?;
} else if let Some(external_frame) = custom_node
.as_any()
.downcast_ref::<crate::common::ExternalFrameNode>(
) {
let data = base64::engine::general_purpose::STANDARD
.decode(&external_frame.svg_data)
.map_err(|e| format!("Failed to decode SVG data: {}", e))?;
docx = self.image_processor.process_image_data(
docx,
&data,
Some(&external_frame.alt_text),
None,
);
} else {
// Fallback for unknown custom nodes - ignore or add placeholder
let placeholder = "[Unknown custom content]";
let para = Paragraph::new().add_run(Run::new().add_text(placeholder));
docx = docx.add_paragraph(para);
}
}
Node::ThematicBreak => {
// Add horizontal line as specially formatted paragraph
let hr_para = Paragraph::new()
.style("HorizontalLine")
.add_run(Run::new().add_text(""));
docx = docx.add_paragraph(hr_para);
}
// Inline elements should not be processed here individually
_ => {}
}
Ok(docx)
}
/// Process ordered list
fn process_ordered_list(&mut self, mut docx: Docx, items: &[ListItem]) -> Result<Docx> {
// Enter deeper list level
self.list_level += 1;
let current_level = self.list_level - 1;
// Create new ordered list numbering definition
let (doc, num_id) = self.numbering.create_ordered_numbering(docx);
docx = doc;
// Process list items
for item in items {
if let ListItem::Ordered { content, .. } = item {
docx = self.process_list_item_content(docx, content, num_id, current_level)?;
}
}
// Exit list level
self.list_level -= 1;
Ok(docx)
}
/// Process unordered list
fn process_unordered_list(&mut self, mut docx: Docx, items: &[ListItem]) -> Result<Docx> {
// Enter deeper list level
self.list_level += 1;
let current_level = self.list_level - 1;
// Create new unordered list numbering definition
let (doc, num_id) = self.numbering.create_unordered_numbering(docx);
docx = doc;
// Process list items
for item in items {
if let ListItem::Unordered { content } = item {
docx = self.process_list_item_content(docx, content, num_id, current_level)?;
}
}
// Exit list level
self.list_level -= 1;
Ok(docx)
}
/// Helper function to process list item content
fn process_list_item_content(
&mut self,
mut docx: Docx,
content: &[Node],
num_id: usize,
level: usize,
) -> Result<Docx> {
// If content is empty, add empty paragraph
if content.is_empty() {
let empty_para = Paragraph::new()
.numbering(NumberingId::new(num_id), IndentLevel::new(level))
.add_run(Run::new().add_text(""));
return Ok(docx.add_paragraph(empty_para));
}
// Process content
for block in content {
match block {
Node::Paragraph(inline) => {
let mut para = Paragraph::new()
.numbering(NumberingId::new(num_id), IndentLevel::new(level));
// Process paragraph content
for node in inline {
let run = Run::new();
let run = self.process_inline_to_run(run, node)?;
if !run.children.is_empty() {
para = para.add_run(run);
}
}
docx = docx.add_paragraph(para);
}
// Recursively process nested lists
Node::OrderedList { start: _, items: _ } | Node::UnorderedList(_) => {
docx = self.process_node(docx, block)?;
}
_ => {
docx = self.process_node(docx, block)?;
}
}
}
Ok(docx)
}
/// Process table
fn process_table(&self, mut docx: Docx, headers: &[Node], rows: &[Vec<Node>]) -> Result<Docx> {
let mut table = Table::new(vec![]).style("Table");
// Process table headers
if !headers.is_empty() {
let mut cells = Vec::new();
for header_node in headers {
let mut table_cell = TableCell::new();
let mut para = Paragraph::new();
let run = Run::new();
let run = self.process_inline_to_run(run, header_node)?;
if !run.children.is_empty() {
para = para.add_run(run);
}
if !para.children.is_empty() {
table_cell = table_cell.add_paragraph(para);
}
cells.push(table_cell);
}
if !cells.is_empty() {
let header_row = TableRow::new(cells);
table = table.add_row(header_row);
}
}
// Process table rows
for row in rows {
let mut cells = Vec::new();
for cell_node in row {
let mut table_cell = TableCell::new();
let mut para = Paragraph::new();
let run = Run::new();
let run = self.process_inline_to_run(run, cell_node)?;
if !run.children.is_empty() {
para = para.add_run(run);
}
if !para.children.is_empty() {
table_cell = table_cell.add_paragraph(para);
}
cells.push(table_cell);
}
if !cells.is_empty() {
let data_row = TableRow::new(cells);
table = table.add_row(data_row);
}
}
// Add table to document
docx = docx.add_table(table);
Ok(docx)
}
/// Generate DOCX document
pub fn generate_docx(&mut self, doc: &Node) -> Result<Vec<u8>> {
// Create DOCX document and initialize styles
let mut docx = Docx::new();
docx = self.styles.initialize_styles(docx);
// Process document content
docx = self.process_node(docx, doc)?;
// Initialize numbering definitions
docx = self.numbering.initialize_numbering(docx);
// Build and pack document
let docx_built = docx.build();
let mut buffer = Vec::new();
docx_built
.pack(&mut Cursor::new(&mut buffer))
.map_err(|e| format!("Failed to pack DOCX: {}", e))?;
Ok(buffer)
}
}
/// `FormatWriter` implementation producing binary DOCX output.
impl FormatWriter for DocxWriter {
    // Serialize the whole document tree into DOCX bytes.
    fn write_vec(&mut self, document: &Node) -> Result<Vec<u8>> {
        // Reset per-document list state so a reused writer starts clean.
        self.list_level = 0;
        self.list_numbering_count = 0;
        self.generate_docx(document)
    }
    // DOCX is a binary container; there is no textual representation to emit.
    fn write_eco(&mut self, _document: &Node, _output: &mut EcoString) -> Result<()> {
        Err("DOCX format does not support EcoString output".into())
    }
}

View file

@ -1,8 +1,12 @@
//! Writer implementations for different output formats
#[cfg(feature = "docx")]
pub mod docx;
pub mod latex;
pub mod markdown;
#[cfg(feature = "docx")]
pub use self::docx::DocxWriter;
pub use latex::LaTeXWriter;
pub use markdown::MarkdownWriter;
@ -13,9 +17,8 @@ pub fn create_writer(format: Format) -> Box<dyn FormatWriter> {
match format {
Format::Md => Box::new(markdown::MarkdownWriter::new()),
Format::LaTeX => Box::new(latex::LaTeXWriter::new()),
Format::Docx => {
panic!("Docx writers are not implemented yet")
}
#[cfg(feature = "docx")]
Format::Docx => Box::new(docx::DocxWriter::new()),
}
}