Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 43 additions & 10 deletions functions/import_pipeline/latex_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,20 @@
import re
import tarfile
import warnings
from typing import Set
from import_pipeline import latex_inline_command

# Base names that win the tie-break when several .tex files contain
# \documentclass; compared against os.path.basename() of each candidate.
PREFERRED_MAIN_FILE_NAMES = ["main.tex", "ms.tex"]

# Command prefixes looked up by plain substring search in a candidate's
# content; a file pulling in other files is more likely to be the main one.
INCLUDE_COMMANDS: Set[str] = {r"\input{", r"\include{"}

# Document classes treated as figures or document parts rather than a main
# document; candidates declaring one of these are dropped during tie-breaking.
EXCLUDED_DOCUMENT_CLASSES: Set[str] = {"standalone", "tikz", "figure", "subfile"}


def extract_tar_gz(source_bytes: bytes, destination_path: str):
Expand Down Expand Up @@ -51,7 +62,7 @@ def find_main_tex_file(source_path: str) -> str:
Raises:
ValueError: If no .tex file with `\\documentclass` is found, or if several are found and the tie-breaking rules cannot single out exactly one.
"""
valid_main_paths = []
valid_main_files = []
for root, _, files in os.walk(source_path):
for file in files:
if file.endswith(".tex"):
Expand All @@ -61,27 +72,49 @@ def find_main_tex_file(source_path: str) -> str:
# Read file to check for \documentclass
content = f.read()
if r"\documentclass" in content:
valid_main_paths.append(full_path)
valid_main_files.append([full_path, content])
except Exception:
# Ignore files that can't be opened or read
continue
if len(valid_main_paths) == 0:
if len(valid_main_files) == 0:
raise ValueError(f"Could not find a main .tex file in {source_path}")

if len(valid_main_paths) > 1:
# Check for 'main.tex' or 'ms.tex' as a tie-breaker
if len(valid_main_files) > 1:
# Rule 1: Check for 'main.tex' or 'ms.tex' as a tie-breaker
preferred_files = [
p
for p in valid_main_paths
if os.path.basename(p) in PREFERRED_MAIN_FILE_NAMES
path
for path, content in valid_main_files
if os.path.basename(path) in PREFERRED_MAIN_FILE_NAMES
]
if len(preferred_files) == 1:
return preferred_files[0]

# Rule 2: Check for \input or \include
filtered_by_input = [
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could you add test cases for rules 2 and 3?

(path, content)
for path, content in valid_main_files
if any(cmd in content for cmd in INCLUDE_COMMANDS)
]
if len(filtered_by_input) == 1:
return filtered_by_input[0][0]
elif len(filtered_by_input) > 1:
# Rule 3: Exclude files using classes typically for figures or parts (e.g. 'standalone', 'tikz')
filtered_by_class = [
(path, content)
for path, content in filtered_by_input
if not any(
f"\\documentclass{{{cls}}}" in content or f"\\documentclass[{cls}" in content
for cls in EXCLUDED_DOCUMENT_CLASSES
)
]
if len(filtered_by_class) == 1:
return filtered_by_class[0][0]

raise ValueError(
f"Found multiple competing main .tex files: {valid_main_paths}"
f"Found multiple competing main .tex files: {[x[0] for x in valid_main_files]}"
)

return valid_main_paths[0]
return valid_main_files[0][0]


def inline_tex_files(
Expand Down
39 changes: 32 additions & 7 deletions functions/import_pipeline/latex_utils_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,6 +114,25 @@ def test_find_main_tex_file(self):
r"\documentclass{article}",
"main.tex",
),
(
"multiple_main_one_with_input",
["first.tex", "second.tex"],
[
r"\documentclass{article}",
r"\documentclass{article}\n\input{section1}"
],
"second.tex",
),
(
"multiple_main_one_with_input_without_figures",
["first.tex", "second.tex", "third.tex"],
[
r"\documentclass{article}\n\input{section1}\n\documentclass[tikz,border=2pt]{standalone}",
r"\documentclass{article}",
r"\documentclass{article}\n\input{section1}"
],
"third.tex",
),
]

for name, file_paths, content, expected in test_cases:
Expand All @@ -123,19 +142,25 @@ def test_find_main_tex_file(self):
os.makedirs(sub_test_dir, exist_ok=True)

# Write content to all specified files
for file_path in file_paths:
full_path = os.path.join(sub_test_dir, file_path)
os.makedirs(os.path.dirname(full_path), exist_ok=True)
with open(full_path, "w") as f:
f.write(content)
if isinstance(content, list):
for file_path, file_content in zip(file_paths, content):
full_path = os.path.join(sub_test_dir, file_path)
os.makedirs(os.path.dirname(full_path), exist_ok=True)
with open(full_path, "w") as f:
f.write(file_content)
else:
for file_path in file_paths:
full_path = os.path.join(sub_test_dir, file_path)
os.makedirs(os.path.dirname(full_path), exist_ok=True)
with open(full_path, "w") as f:
f.write(content)

if isinstance(expected, type) and issubclass(expected, Exception):
with self.assertRaises(expected):
latex_utils.find_main_tex_file(sub_test_dir)
else:
result = latex_utils.find_main_tex_file(sub_test_dir)
self.assertEqual(result, os.path.join(sub_test_dir, expected))

self.assertEqual(os.path.normpath(result), os.path.normpath(os.path.join(sub_test_dir, expected)))
# Cleanup sub-directory
shutil.rmtree(sub_test_dir)

Expand Down