# WebBot methods (botcity/web/bot.py)

def stop_browser(self):
    """
    Stop the browser session and reset the driver-related state of the bot.

    Does nothing when no driver is currently set. Otherwise the current window
    is closed and the WebDriver session is quit; `quit()` is still attempted
    when `close()` raises, and the driver/options/capabilities attributes are
    always reset so the bot never keeps a reference to a dead session.

    Warning:
        After invoking this method, you will need to reassign your custom options and capabilities.
    """
    if not self._driver:
        return
    try:
        self._driver.close()
    finally:
        try:
            self._driver.quit()
        finally:
            # Reset unconditionally: a failed shutdown must not leave stale state behind.
            self.options = None
            self.capabilities = None
            self._driver = None


@contextmanager
def wait_for_new_page(self, waiting_time=10000, activate=True):
    """Context manager to wait for a new page (tab/window) to open and activate it.

    The handles returned by `get_tabs()` are captured on entry. Once the `with`
    body has finished, `get_tabs()` is polled every 100ms until a handle that was
    not present on entry shows up or `waiting_time` has elapsed. On timeout the
    context manager exits silently without activating anything.

    Args:
        waiting_time (int, optional): The maximum waiting time (ms).
            Defaults to 10000.
        activate (bool, optional): Whether or not to activate the new page. Defaults to True.

    """
    known_tabs = set(self.get_tabs())
    yield
    deadline = time.monotonic() + waiting_time / 1000.0
    while True:
        # Look for handles that did not exist on entry instead of trusting the
        # position in the list: the order of window handles is not guaranteed.
        new_tabs = [tab for tab in self.get_tabs() if tab not in known_tabs]
        if new_tabs:
            break
        if time.monotonic() >= deadline:
            return
        time.sleep(0.1)
    if activate:
        self.activate_tab(new_tabs[-1])
# WebBot methods (botcity/web/bot.py)

def find_elements(self, selector: str, by: By = By.CSS_SELECTOR,
                  waiting_time=10000, ensure_visible: bool = True) -> List[WebElement]:
    """Find elements using the specified selector with selector type specified by `by`.

    Args:
        selector (str): The selector string to be used.
        by (str, optional): Selector type. Defaults to By.CSS_SELECTOR.
            [See more](https://selenium-python.readthedocs.io/api.html#selenium.webdriver.common.by.By)
        waiting_time (int, optional): Maximum wait time (ms) to search for a hit.
            Defaults to 10000ms (10s).
        ensure_visible (bool, optional): Whether to wait for the element to be visible. Defaults to True.

    Returns:
        List[WebElement]: List of elements found. An empty list is returned when
            nothing matched within `waiting_time`, so the result is always iterable.

    **Example:**
    ```python
    from botcity.web import By
    ...
    all_cells = self.find_elements("//td", By.XPATH)
    ...
    ```
    """
    if ensure_visible:
        condition = EC.visibility_of_all_elements_located
    else:
        condition = EC.presence_of_all_elements_located

    wait = WebDriverWait(self._driver, timeout=waiting_time / 1000.0)
    try:
        return wait.until(condition((by, selector)))
    except (TimeoutException, NoSuchElementException):
        # Keep the declared list contract (and stay quiet, like `find_element`)
        # instead of printing and returning None.
        return []


def find_element(self, selector: str, by: str = By.CSS_SELECTOR, waiting_time=10000,
                 ensure_visible: bool = False, ensure_clickable: bool = False) -> WebElement:
    """Find an element using the specified selector with selector type specified by `by`.
    If more than one element is found, the first instance is returned.

    Args:
        selector (str): The selector string to be used.
        by (str, optional): Selector type. Defaults to By.CSS_SELECTOR.
            [See more](https://selenium-python.readthedocs.io/api.html#selenium.webdriver.common.by.By)
        waiting_time (int, optional): Maximum wait time (ms) to search for a hit.
            Defaults to 10000ms (10s).
        ensure_visible (bool, optional): Whether to wait for the element to be visible. Defaults to False.
        ensure_clickable (bool, optional): Whether to wait for the element to be clickable. Defaults to False.
            If True, `ensure_clickable` takes precedence over `ensure_visible`.

    Returns:
        WebElement: The element found, or None when no element satisfied the
            condition within `waiting_time`.

    **Example:**
    ```python
    from botcity.web import By
    ...
    # Find the input element of type `file` using CSS_SELECTOR.
    elem = self.find_element("body > form > input[type=file]", By.CSS_SELECTOR)
    ...
    ```
    """
    # Clickable wins over visible, which wins over mere presence in the DOM.
    if ensure_clickable:
        condition = EC.element_to_be_clickable
    elif ensure_visible:
        condition = EC.visibility_of_element_located
    else:
        condition = EC.presence_of_element_located

    wait = WebDriverWait(self._driver, timeout=waiting_time / 1000.0)
    try:
        return wait.until(condition((by, selector)))
    except (TimeoutException, NoSuchElementException):
        return None


def wait_for_stale_element(self, element: WebElement, timeout: int = 10000):
    """
    Wait until the WebElement element becomes stale (outdated).

    Args:
        element (WebElement): The element to monitor for staleness.
        timeout (int, optional): Timeout in millis. Defaults to 10000.

    Returns:
        bool: True if the element became stale within `timeout`, False if the
            wait timed out.
    """
    try:
        WebDriverWait(self._driver, timeout=timeout / 1000.0).until(EC.staleness_of(element))
    except (TimeoutException, NoSuchElementException):
        return False
    return True


def wait_for_element_visibility(self, element: WebElement, visible: bool = True, waiting_time=10000):
    """Wait for the element to be visible or hidden.

    Unlike `find_element`/`find_elements`, nothing is caught here: whatever
    `WebDriverWait.until` raises when the wait expires propagates to the caller.

    Args:
        element (WebElement): The element to wait for.
        visible (bool, optional): Whether to wait for the element to be visible. Defaults to True.
        waiting_time (int, optional): Maximum wait time (ms) to search for a hit.
            Defaults to 10000ms (10s).
    """
    condition = EC.visibility_of if visible else EC.invisibility_of_element
    WebDriverWait(self._driver, timeout=waiting_time / 1000.0).until(condition(element))
# --- botcity/web/parsers.py ---

def data_from_row(row: "WebElement", cell_tag="td") -> List[str]:
    """Extract data from a row and return it as a list.

    Args:
        row (WebElement): The row element.
        cell_tag (str, optional): The HTML tag associated with the row cells. Defaults to "td".

    Returns:
        list: List of strings with the contents.
    """
    # "tag name" is the value of By.TAG_NAME. The generic locator form is used
    # because the `find_elements_by_*` helpers no longer exist in Selenium 4.3+.
    return [cell.text for cell in row.find_elements("tag name", cell_tag)]


def sanitize_header(labels: List[str]):
    """Sanitize header labels so they can be used as dictionary keys.

    Each label is lower-cased, stripped of punctuation and has every run of
    whitespace replaced by a single underscore. Labels that end up empty
    (blank or punctuation-only) become `col_<index>`. Repeated labels get the
    suffixes `_1`, `_2`, ... in order of appearance.

    Args:
        labels (List[str]): The raw header labels. The list is not modified.

    Returns:
        List[str]: A new list with the sanitized labels.
    """
    drop_punctuation = str.maketrans("", "", string.punctuation)

    cleaned = []
    for idx, raw in enumerate(labels):
        label = "_".join(raw.lower().translate(drop_punctuation).split())
        # Blank and punctuation-only headers would otherwise yield an empty key.
        cleaned.append(label or f"col_{idx}")

    # Deduplicate by adding _1, _2, _3 to repeated labels
    totals = collections.Counter(cleaned)
    seen = collections.Counter()
    unique = []
    for label in cleaned:
        if totals[label] > 1:
            seen[label] += 1
            label = f"{label}_{seen[label]}"
        unique.append(label)
    return unique


def table_to_dict(table: "WebElement", has_header: bool = True,
                  skip_rows: int = 0, header_tag: str = "th") -> List[Dict]:
    """Convert a table WebElement to a list of dicts, one per data row.

    Args:
        table (WebElement): The table element.
        has_header (bool, optional): Whether or not to parse a header. Defaults to True.
        skip_rows (int, optional): Number of rows to skip from the top. Defaults to 0.
        header_tag (str, optional): The HTML tag associated with the header cell. Defaults to "th".

    Returns:
        list: List with dict for each row. Empty when the table has no rows
            left after skipping.
    """
    # Collect all rows from table
    rows = table.find_elements("tag name", "tr")

    # Skip rows if informed
    if skip_rows:
        rows = rows[skip_rows:]

    # Nothing to parse: avoid indexing the first row of an empty table.
    if not rows:
        return []

    if has_header:
        # The first remaining row provides the (sanitized) keys.
        labels = sanitize_header(data_from_row(rows[0], cell_tag=header_tag))
        rows = rows[1:]
    else:
        # Make up header labels from the width of the first row.
        num_cols = len(rows[0].find_elements("tag name", "td"))
        labels = [f"col_{i}" for i in range(num_cols)]

    return [dict(zip(labels, data_from_row(row))) for row in rows]


# --- botcity/web/util.py ---

def cleanup_temp_dir(temp_dir: tempfile.TemporaryDirectory) -> None:
    """
    Deletes the temporary directory and all its contents.

    Args:
        temp_dir (tempfile.TemporaryDirectory): The temporary directory to delete.
            Falsy values (e.g. None) are ignored.
    """
    if not temp_dir:
        return
    try:
        temp_dir.cleanup()
    except OSError:
        # Best effort: fall back to removing whatever can be removed.
        shutil.rmtree(temp_dir.name, ignore_errors=True)


def element_as_select(element: "WebElement") -> "Select":
    """Wraps a WebElement in a Select object.

    Args:
        element (WebElement): The element to wrap.

    Returns:
        Select: The Select object.
    """
    return Select(element)
+ +For that we developed some utility functions that you can use. + +## Select Element + +After grabbing the element via the `find_element` or `find_elements` functions, we can use the `element_as_select` to convert it into a `Select` object. + +::: botcity.web.util.element_as_select + +### Example usage + +```python +# Import the function +from botcity.web.util import element_as_select +... +# Fetch the select element +element = self.find_element("select", By.TAG_NAME) +# Convert the element into a Select object +select_element = element_as_select(element) +# Select the option based on visible text +select_element.select_by_visible_text("Option 1") +... +``` + +## File Upload + +After grabbing the element via the `find_element` or `find_elements` functions, we can use the `set_file_input_element` to assign the file path to the element. + +### Example usage + +```python +from botcity.web import By +... +# Find the input element of type `file` using CSS_SELECTOR. +elem = self.find_element("body > form > input[type=file]", By.CSS_SELECTOR) +# Configure the file to be used when processing the upload +self.set_file_input_element(elem, "./test.txt") +... +``` diff --git a/docs/intro.md b/docs/intro.md index 40492c3..1c9731d 100644 --- a/docs/intro.md +++ b/docs/intro.md @@ -63,14 +63,14 @@ class Bot(WebBot): # Changes the Browser to Firefox self.browser = Browser.FIREFOX - + # For Chrome # self.browser = Browser.CHROME - + ...
``` @@ -101,20 +101,20 @@ By default the browsers are launched with a set of curated options which we pick Before getting into how to customize those details let's walk through some of the assumptions and details which are covered by the `default options`. -- **Headless Execution**: Depending on the `headless` property set on your Bot class we pick the +- **Headless Execution**: Depending on the `headless` property set on your Bot class we pick the proper configuration to launch the browser in the desired mode. - + - **Downloads Folder Path**: By default we save all downloaded files on the Desktop folder. - **User Profile**: By default we generate a temporary directory (which is later erased) to be used as the profile directory. This procedure ensure that every execution starts with a clean browser session and things such as cookies and stored passwords or certificates from one execution won't interfere with the others. - -A handful of other options are also set and they can be inspected on the source code for each browser on the + +A handful of other options are also set and they can be inspected on the source code for each browser on the `botcity.web.browsers` module. -If you really need to customize the options you can do so via the `options` property. You can fetch +If you really need to customize the options you can do so via the `options` property. You can fetch the `default options` curated by BotCity and make your changes or start your options from scratch. In the following snippet we will cover how to build on top of the existing options. @@ -140,13 +140,13 @@ class Bot(WebBot): download_folder_path=self.download_folder_path, user_data_dir=None # Informing None here will generate a temporary directory ) - + # Add your customized argument def_options.add_argument("") - + # Update the options to use the customized Options. self.options = def_options - + ... 
``` @@ -156,6 +156,16 @@ Every supported browser will have an exclusive module with curated default optio This function takes in arguments to define the mode of execution (headless or not), default download folder path and user data/profile directory. +### Parsing Data + +When extracting data from websites we often need to read information from tables and other common sources. +The `Handling Data` section highlights how we can do that and which functions are available to be used. + +### Forms + +It is common while navigating websites to have to fill out forms and submit them. +To facilitate this we have an `Interacting with Forms` section which highlights the available functions to be used. + ### Next Steps Check our examples and experiment with the API. diff --git a/docs/parsers.md b/docs/parsers.md new file mode 100644 index 0000000..df6084c --- /dev/null +++ b/docs/parsers.md @@ -0,0 +1,67 @@ +# Handling Data + +## Tables + +To extract data from tables we offer a utility function which parses the table and returns a list of dictionaries. + +::: botcity.web.parsers.table_to_dict + +### Usage Example + +Let's fetch data from the example table available at the [W3Schools website](https://www.w3schools.com/html/html_tables.asp).
+ + +```python +class Bot(WebBot): + def action(self, execution=None): + self.browse("https://www.w3schools.com/html/html_tables.asp") + + # Fetch the table + table = self.find_element("table", By.TAG_NAME) + + # Parse the table + parsed_table = table_to_dict(table) + + # Print the parsed table + print(parsed_table) + + # Close the browser and free resources + self.stop_browser() +``` + +The output should look like this: + +```python +[ + { + 'company': 'Alfreds Futterkiste', + 'contact': 'Maria Anders', + 'country': 'Germany' + }, + { + 'company': 'Centro comercial Moctezuma', + 'contact': 'Francisco Chang', + 'country': 'Mexico' + }, + { + 'company': 'Ernst Handel', + 'contact': 'Roland Mendel', + 'country': 'Austria' + }, + { + 'company': 'Island Trading', + 'contact': 'Helen Bennett', + 'country': 'UK' + }, + { + 'company': 'Laughing Bacchus Winecellars', + 'contact': 'Yoshi Tannamuri', + 'country': 'Canada' + }, + { + 'company': 'Magazzini Alimentari Riuniti', + 'contact': 'Giovanni Rovelli', + 'country': 'Italy' + } +] +``` \ No newline at end of file diff --git a/mkdocs.yml b/mkdocs.yml index a68dc92..30a68a6 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -7,6 +7,8 @@ nav: - Home: index.md - Getting Started: intro.md - Framework: bot.md + - Handling Data: parsers.md + - Interacting with Forms: forms.md - Browsers: browsers.md theme: @@ -32,6 +34,8 @@ theme: name: Switch to light mode markdown_extensions: + - admonition + - pymdownx.details - pymdownx.highlight - pymdownx.superfences