Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

let client support more file types #74

Merged
merged 5 commits into from Mar 13, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
3 changes: 1 addition & 2 deletions README.md
Expand Up @@ -4,9 +4,8 @@ LlamaParse is an API created by LlamaIndex to efficiently parse and represent fi

LlamaParse directly integrates with [LlamaIndex](https://github.com/run-llama/llama_index).

Currently available in preview mode for **free**. Try it out today!

**NOTE:** Currently, only PDF files are supported.
The free plan allows up to 1000 pages a day. The paid plan includes 7k free pages per week, plus 0.3c per additional page.

## Getting Started

Expand Down
31 changes: 24 additions & 7 deletions llama_parse/base.py
Expand Up @@ -108,6 +108,19 @@ class Language(str, Enum):
VIETNAMESE = "vi"


# File extensions (lowercase, including the leading dot) that the client
# accepts for upload. Compared against the result of os.path.splitext().
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated by Python (".xml" ".doc" would become ".xml.doc").
SUPPORTED_FILE_TYPES = [
    ".pdf",
    ".xml",
    ".doc",
    ".docx",
    ".pptx",
    ".rtf",
    ".pages",
    ".key",
    ".epub",
]


class LlamaParse(BasePydanticReader):
"""A smart-parser for files."""

Expand Down Expand Up @@ -165,8 +178,12 @@ def validate_base_url(cls, v: str) -> str:
# upload a document and get back a job_id
async def _create_job(self, file_path: str, extra_info: Optional[dict] = None) -> str:
file_path = str(file_path)
if not file_path.endswith(".pdf"):
raise Exception("Currently, only PDF files are supported.")
file_ext = os.path.splitext(file_path)[1]
if file_ext not in SUPPORTED_FILE_TYPES:
raise Exception(
f"Currently, only the following file types are supported: {SUPPORTED_FILE_TYPES}\n"
f"Current file type: {file_ext}"
)

extra_info = extra_info or {}
extra_info["file_path"] = file_path
Expand All @@ -183,7 +200,7 @@ async def _create_job(self, file_path: str, extra_info: Optional[dict] = None) -
async with httpx.AsyncClient(timeout=self.max_timeout) as client:
response = await client.post(url, files=files, headers=headers, data={"language": self.language.value, "parsing_instruction": self.parsing_instruction})
if not response.is_success:
raise Exception(f"Failed to parse the PDF file: {response.text}")
raise Exception(f"Failed to parse the file: {response.text}")

# check the status of the job, return when done
job_id = response.json()["id"]
Expand All @@ -206,15 +223,15 @@ async def _get_job_result(self, job_id: str, result_type: str) -> dict:
end = time.time()
if end - start > self.max_timeout:
raise Exception(
f"Timeout while parsing the PDF file: {job_id}"
f"Timeout while parsing the file: {job_id}"
)
if self.verbose and tries % 10 == 0:
print(".", end="", flush=True)
continue

if result.status_code == 400:
detail = result.json().get("detail", "Unknown error")
raise Exception(f"Failed to parse the PDF file: {detail}")
raise Exception(f"Failed to parse the file: {detail}")

return result.json()

Expand All @@ -235,7 +252,7 @@ async def _aload_data(self, file_path: str, extra_info: Optional[dict] = None) -
]

except Exception as e:
print(f"Error while parsing the PDF file '{file_path}':", e)
print(f"Error while parsing the file '{file_path}':", e)
raise e
return []

Expand Down Expand Up @@ -283,7 +300,7 @@ async def _aget_json(self, file_path: str, extra_info: Optional[dict] = None) ->
return [result]

except Exception as e:
print(f"Error while parsing the PDF file '{file_path}':", e)
print(f"Error while parsing the file '{file_path}':", e)
raise e


Expand Down