diff --git a/README.md b/README.md
index 4f76025..7741ba0 100644
--- a/README.md
+++ b/README.md
@@ -4,9 +4,8 @@ LlamaParse is an API created by LlamaIndex to efficiently parse and represent fi
 
 LlamaParse directly integrates with [LlamaIndex](https://github.com/run-llama/llama_index).
 
-Currently available in preview mode for **free**. Try it out today!
 
-**NOTE:** Currently, only PDF files are supported.
+Free plan is up to 1000 pages a day. Paid plan is free 7k pages per week + 0.3c per additional page.
 
 ## Getting Started
 
diff --git a/llama_parse/base.py b/llama_parse/base.py
index 07ea39b..8341418 100644
--- a/llama_parse/base.py
+++ b/llama_parse/base.py
@@ -108,6 +108,19 @@ class Language(str, Enum):
     VIETNAMESE = "vi"
 
 
+SUPPORTED_FILE_TYPES = [  # file extensions (leading dot included) accepted by _create_job
+    ".pdf",
+    ".xml",
+    ".doc",
+    ".docx",
+    ".pptx",
+    ".rtf",
+    ".pages",
+    ".key",
+    ".epub",
+]
+
+
 class LlamaParse(BasePydanticReader):
     """A smart-parser for files."""
 
@@ -165,8 +178,12 @@ def validate_base_url(cls, v: str) -> str:
     # upload a document and get back a job_id
     async def _create_job(self, file_path: str, extra_info: Optional[dict] = None) -> str:
         file_path = str(file_path)
-        if not file_path.endswith(".pdf"):
-            raise Exception("Currently, only PDF files are supported.")
+        file_ext = os.path.splitext(file_path)[1]  # keeps the leading dot; the match below is case-sensitive
+        if file_ext not in SUPPORTED_FILE_TYPES:
+            raise Exception(
+                f"Currently, only the following file types are supported: {SUPPORTED_FILE_TYPES}\n"
+                f"Current file type: {file_ext}"
+            )
 
         extra_info = extra_info or {}
         extra_info["file_path"] = file_path
@@ -183,7 +200,7 @@ async def _create_job(self, file_path: str, extra_info: Optional[dict] = None) -
         async with httpx.AsyncClient(timeout=self.max_timeout) as client:
             response = await client.post(url, files=files, headers=headers, data={"language": self.language.value, "parsing_instruction": self.parsing_instruction})
             if not response.is_success:
-                raise Exception(f"Failed to parse the PDF file: {response.text}")
+                raise Exception(f"Failed to parse the file: {response.text}")
 
         # check the status of the job, return when done
         job_id = response.json()["id"]
@@ -206,7 +223,7 @@ async def _get_job_result(self, job_id: str, result_type: str) -> dict:
                 end = time.time()
                 if end - start > self.max_timeout:
                     raise Exception(
-                        f"Timeout while parsing the PDF file: {job_id}"
+                        f"Timeout while parsing the file: {job_id}"
                     )
                 if self.verbose and tries % 10 == 0:
                     print(".", end="", flush=True)
@@ -214,7 +231,7 @@ async def _get_job_result(self, job_id: str, result_type: str) -> dict:
 
             if result.status_code == 400:
                 detail = result.json().get("detail", "Unknown error")
-                raise Exception(f"Failed to parse the PDF file: {detail}")
+                raise Exception(f"Failed to parse the file: {detail}")
 
             return result.json()
 
@@ -235,7 +252,7 @@ async def _aload_data(self, file_path: str, extra_info: Optional[dict] = None) -
                 ]
 
         except Exception as e:
-            print(f"Error while parsing the PDF file '{file_path}':", e)
+            print(f"Error while parsing the file '{file_path}':", e)
             raise e
             return []
 
@@ -283,7 +300,7 @@ async def _aget_json(self, file_path: str, extra_info: Optional[dict] = None) ->
             return [result]
 
         except Exception as e:
-            print(f"Error while parsing the PDF file '{file_path}':", e)
+            print(f"Error while parsing the file '{file_path}':", e)
             raise e