Skip to content

Commit

Permalink
let client support more file types (#74)
Browse files Browse the repository at this point in the history
  • Loading branch information
jerryjliu committed Mar 13, 2024
1 parent ab27f2a commit 393acf8
Show file tree
Hide file tree
Showing 2 changed files with 25 additions and 9 deletions.
3 changes: 1 addition & 2 deletions README.md
Expand Up @@ -4,9 +4,8 @@ LlamaParse is an API created by LlamaIndex to efficiently parse and represent fi

LlamaParse directly integrates with [LlamaIndex](https://github.com/run-llama/llama_index).

Currently available in preview mode for **free**. Try it out today!

**NOTE:** Currently, only PDF files are supported.
The free plan allows up to 1,000 pages per day. The paid plan includes 7,000 free pages per week, plus 0.3c per additional page.

## Getting Started

Expand Down
31 changes: 24 additions & 7 deletions llama_parse/base.py
Expand Up @@ -108,6 +108,19 @@ class Language(str, Enum):
VIETNAMESE = "vi"


# File extensions (including the leading dot) that the client accepts for
# upload. Values are compared against the result of os.path.splitext().
#
# NOTE: every entry must be followed by a comma. Python silently concatenates
# adjacent string literals, so a missing comma merges two extensions into one
# bogus entry (e.g. ".xml" ".doc" -> ".xml.doc") and both real extensions end
# up rejected.
SUPPORTED_FILE_TYPES = [
    ".pdf",
    ".xml",
    ".doc",
    ".docx",
    ".pptx",
    ".rtf",
    ".pages",
    ".key",
    ".epub",
]


class LlamaParse(BasePydanticReader):
"""A smart-parser for files."""

Expand Down Expand Up @@ -165,8 +178,12 @@ def validate_base_url(cls, v: str) -> str:
# upload a document and get back a job_id
async def _create_job(self, file_path: str, extra_info: Optional[dict] = None) -> str:
file_path = str(file_path)
if not file_path.endswith(".pdf"):
raise Exception("Currently, only PDF files are supported.")
file_ext = os.path.splitext(file_path)[1]
if file_ext not in SUPPORTED_FILE_TYPES:
raise Exception(
f"Currently, only the following file types are supported: {SUPPORTED_FILE_TYPES}\n"
f"Current file type: {file_ext}"
)

extra_info = extra_info or {}
extra_info["file_path"] = file_path
Expand All @@ -183,7 +200,7 @@ async def _create_job(self, file_path: str, extra_info: Optional[dict] = None) -
async with httpx.AsyncClient(timeout=self.max_timeout) as client:
response = await client.post(url, files=files, headers=headers, data={"language": self.language.value, "parsing_instruction": self.parsing_instruction})
if not response.is_success:
raise Exception(f"Failed to parse the PDF file: {response.text}")
raise Exception(f"Failed to parse the file: {response.text}")

# check the status of the job, return when done
job_id = response.json()["id"]
Expand All @@ -206,15 +223,15 @@ async def _get_job_result(self, job_id: str, result_type: str) -> dict:
end = time.time()
if end - start > self.max_timeout:
raise Exception(
f"Timeout while parsing the PDF file: {job_id}"
f"Timeout while parsing the file: {job_id}"
)
if self.verbose and tries % 10 == 0:
print(".", end="", flush=True)
continue

if result.status_code == 400:
detail = result.json().get("detail", "Unknown error")
raise Exception(f"Failed to parse the PDF file: {detail}")
raise Exception(f"Failed to parse the file: {detail}")

return result.json()

Expand All @@ -235,7 +252,7 @@ async def _aload_data(self, file_path: str, extra_info: Optional[dict] = None) -
]

except Exception as e:
print(f"Error while parsing the PDF file '{file_path}':", e)
print(f"Error while parsing the file '{file_path}':", e)
raise e
return []

Expand Down Expand Up @@ -283,7 +300,7 @@ async def _aget_json(self, file_path: str, extra_info: Optional[dict] = None) ->
return [result]

except Exception as e:
print(f"Error while parsing the PDF file '{file_path}':", e)
print(f"Error while parsing the file '{file_path}':", e)
raise e


Expand Down

0 comments on commit 393acf8

Please sign in to comment.