From c6ec5afc8b4cc50cc2c1d862b6fa28fb9912a2be Mon Sep 17 00:00:00 2001 From: BikramK Date: Sat, 15 Nov 2025 12:53:32 +0000 Subject: [PATCH 1/2] Fix: added a couple more parsing logic to address #109 --- functions/import_pipeline/latex_utils.py | 53 +++++++++++++++++++----- 1 file changed, 43 insertions(+), 10 deletions(-) diff --git a/functions/import_pipeline/latex_utils.py b/functions/import_pipeline/latex_utils.py index cc1f236e..71e0fc87 100644 --- a/functions/import_pipeline/latex_utils.py +++ b/functions/import_pipeline/latex_utils.py @@ -18,9 +18,20 @@ import re import tarfile import warnings +from typing import Set from import_pipeline import latex_inline_command PREFERRED_MAIN_FILE_NAMES = ["main.tex", "ms.tex"] +INCLUDE_COMMANDS: Set[str] = { + r"\input{", + r"\include{", +} +EXCLUDED_DOCUMENT_CLASSES: Set[str] = { + "standalone", + "tikz", + "figure", + "subfile", +} def extract_tar_gz(source_bytes: bytes, destination_path: str): @@ -51,7 +62,7 @@ def find_main_tex_file(source_path: str) -> str: Raises: ValueError: If zero or more than one .tex file with `\\documentclass` is found. """ - valid_main_paths = [] + valid_main_files = [] for root, _, files in os.walk(source_path): for file in files: if file.endswith(".tex"): @@ -61,27 +72,49 @@ def find_main_tex_file(source_path: str) -> str: # Read file to check for \documentclass content = f.read() if r"\documentclass" in content: - valid_main_paths.append(full_path) + valid_main_files.append([full_path, content]) except Exception: # Ignore files that can't be opened or read continue - if len(valid_main_paths) == 0: + if len(valid_main_files) == 0: raise ValueError(f"Could not find a main .tex file in {source_path}") - if len(valid_main_paths) > 1: - # Check for 'main.tex' or 'ms.tex' as a tie-breaker + if len(valid_main_files) > 1: + # Rule 1: Check for 'main.tex' or 'ms.tex' as a tie-breaker preferred_files = [ - p - for p in valid_main_paths - if os.path.basename(p) in PREFERRED_MAIN_FILE_NAMES + path + for path, content in valid_main_files + if os.path.basename(path) in PREFERRED_MAIN_FILE_NAMES ] if len(preferred_files) == 1: return preferred_files[0] + + # Rule 2: Check for \input or \include + filtered_by_input = [ + (path, content) + for path, content in valid_main_files + if any(cmd in content for cmd in INCLUDE_COMMANDS) + ] + if len(filtered_by_input) == 1: + return filtered_by_input[0][0] + elif len(filtered_by_input) > 1: + # Rule 3: Exclude files using classes typically for figures or parts (e.g. 'standalone', 'tikz') + filtered_by_class = [ + (path, content) + for path, content in filtered_by_input + if not any( + f"\\documentclass{{{cls}}}" in content or f"\\documentclass[{cls}" in content + for cls in EXCLUDED_DOCUMENT_CLASSES + ) + ] + if len(filtered_by_class) == 1: + return filtered_by_class[0][0] + raise ValueError( - f"Found multiple competing main .tex files: {valid_main_paths}" + f"Found multiple competing main .tex files: {[x[0] for x in valid_main_files]}" ) - return valid_main_paths[0] + return valid_main_files[0][0] def inline_tex_files( From 1b9486bcbd5afb5c3c9d93fe21323d69f4068491 Mon Sep 17 00:00:00 2001 From: BikramK Date: Sun, 8 Feb 2026 21:23:47 +0000 Subject: [PATCH 2/2] Added tests for Rule 2 and 3 of find_main_tex_file() --- functions/import_pipeline/latex_utils_test.py | 39 +++++++++++++++---- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/functions/import_pipeline/latex_utils_test.py b/functions/import_pipeline/latex_utils_test.py index b599bacb..0de1b39b 100644 --- a/functions/import_pipeline/latex_utils_test.py +++ b/functions/import_pipeline/latex_utils_test.py @@ -114,6 +114,25 @@ def test_find_main_tex_file(self): r"\documentclass{article}", "main.tex", ), + ( + "multiple_main_one_with_input", + ["first.tex", "second.tex"], + [ + r"\documentclass{article}", + r"\documentclass{article}\n\input{section1}" + ], + "second.tex", + ), + ( + "multiple_main_one_with_input_without_figures", + ["first.tex", "second.tex", "third.tex"], + [ + r"\documentclass{article}\n\input{section1}\n\documentclass[tikz,border=2pt]{standalone}", + r"\documentclass{article}", + r"\documentclass{article}\n\input{section1}" + ], + "third.tex", + ), ] for name, file_paths, content, expected in test_cases: @@ -123,19 +142,25 @@ def test_find_main_tex_file(self): os.makedirs(sub_test_dir, exist_ok=True) # Write content to all specified files - for file_path in file_paths: - full_path = os.path.join(sub_test_dir, file_path) - os.makedirs(os.path.dirname(full_path), exist_ok=True) - with open(full_path, "w") as f: - f.write(content) + if isinstance(content, list): + for file_path, file_content in zip(file_paths, content): + full_path = os.path.join(sub_test_dir, file_path) + os.makedirs(os.path.dirname(full_path), exist_ok=True) + with open(full_path, "w") as f: + f.write(file_content) + else: + for file_path in file_paths: + full_path = os.path.join(sub_test_dir, file_path) + os.makedirs(os.path.dirname(full_path), exist_ok=True) + with open(full_path, "w") as f: + f.write(content) if isinstance(expected, type) and issubclass(expected, Exception): with self.assertRaises(expected): latex_utils.find_main_tex_file(sub_test_dir) else: result = latex_utils.find_main_tex_file(sub_test_dir) - self.assertEqual(result, os.path.join(sub_test_dir, expected)) - + self.assertEqual(os.path.normpath(result), os.path.normpath(os.path.join(sub_test_dir, expected))) # Cleanup sub-directory shutil.rmtree(sub_test_dir)