Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[BUG] - Scrolling doesn't work in AWS Lambda #301

Open
joaofaria97 opened this issue Dec 4, 2023 · 1 comment
Open

[BUG] - Scrolling doesn't work in AWS Lambda #301

joaofaria97 opened this issue Dec 4, 2023 · 1 comment
Labels
bug Something isn't working

Comments

@joaofaria97
Copy link

Environment

  • chrome-aws-lambda Version: chrome-aws-lambda:22
  • puppeteer / puppeteer-core Version: 21.5.2
  • OS: Windows
  • Node.js Version: v18.18.0
  • Lambda / GCF Runtime: nodejs14.x

Expected Behavior

I currently have a page that needs to be scrolled down to fully load the elements I'm trying to scrape.
When I run my scraping code locally, it scrolls fine and successfully loads all the elements.

Snapshot after local run where all elements are successfully loaded:
1701686880746-elementCount_424

Current Behavior

However, when I run this in Lambda, the scrolling does not work.
The elements are not fully loaded (only the 50 elements that are loaded at the beginning, vs. the 400+ that get loaded when I run it locally).
I've tried using different selectors as targets to scroll to, but none seem to work.

Snapshot after lambda run after scrolling is called:
1701688595177-elementCount_50

Steps to Reproduce

URL: https://sports.bwin.pt/pt/sports/futebol-4/apostar

const AWS = require('aws-sdk')
const s3 = new AWS.S3({apiVersion: '2006-03-01'});
const chromium = require('chrome-aws-lambda');

// Page to scrape; read from the TARGET_URL environment variable (undefined when unset).
const { TARGET_URL: pageURL } = process.env;

// Desktop Chrome user agent string sent with every request made by the scraped page.
const agent =
  'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.116 Safari/537.36';


const Bwin = require('./scrapers/bwin.js')
const db = require('./db.js')

exports.handler = async (event, context) => {

  let result = null;
  let browser = null;

  try {
    browser = await chromium.puppeteer.launch({
      args: chromium.args,
      defaultViewport: chromium.defaultViewport,
      executablePath: await chromium.executablePath,
      headless: chromium.headless,
      ignoreHTTPSErrors: true,
    });

    await db.connectToDb()

    let page = await browser.newPage();
    
    await page.setUserAgent(agent)

    console.log('Navigating to page: ', pageURL)

    await page.goto(pageURL, { waitUntil: 'networkidle2'})
    

    let bwin = new Bwin()
    let events = await bwin.scrapeEvents(page)
    console.log('length: ', events.length)

    const buffer = await page.screenshot()
    // upload the image using the current timestamp as filename
    const s3result = await s3
      .upload({
        Bucket: 'mybucket',
        Key: `${Date.now()}-${events.length}.png`,
        Body: buffer,
        ContentType: 'image/png',
        ACL: 'public-read'
      })
      .promise()
      
    console.log('S3 image URL:', s3result.Location)
    console.log('URL: ', page.url())
    await page.close();
    await browser.close();
    
  } catch (error) {
    console.log(error)
  } finally {
    if (browser !== null) {
      await browser.close();
    }
  }

  return result
}
const db = require('../db.js')
const AWS = require('aws-sdk')
const s3 = new AWS.S3({apiVersion: '2006-03-01'});

/**
 * Base class for puppeteer scrapers that read a list which grows as the page
 * is scrolled.
 *
 * The methods below read these members from `this`, so a subclass is expected
 * to provide them:
 *   - eventSelector: CSS selector matching one scraped element
 *   - popupSelector: CSS selector of the button clicked before scraping
 *   - scrollableSelector: CSS selector of the element scrolled into view
 *   - getEventInfo(elements): executed in the page via `page.$$eval`
 *   - parseEventInfo(rawEvent): executed in Node on each extracted item
 *
 * `loadElements` also uses the module-level `s3` client.
 */
class Scraper {
    /**
     * Runs the whole flow: dismiss the popup, wait for the first elements,
     * scroll until no new elements appear, then extract and parse them.
     *
     * @param {object} page - puppeteer page, already navigated to the target URL.
     * @returns {Promise<Array>} One `parseEventInfo` result per matched element.
     */
    async scrapeEvents(page) {
        await this.loadPage(page);

        // Forward the page's own console output to the Node process log.
        page.on('console', (msg) => console.log('PAGE LOG:', msg.text()));

        await this.loadAllElements(page, this.eventSelector);
        const rawEvents = await page.$$eval(this.eventSelector, this.getEventInfo);
        return rawEvents.map((rawEvent) => this.parseEventInfo(rawEvent));
    }

    /**
     * Clicks the popup button and waits until at least one element matching
     * `eventSelector` exists.
     *
     * @param {object} page - puppeteer page.
     */
    async loadPage(page) {
        await this.closePopUp(page);
        await this.loadElements(page, this.eventSelector, 0);
    }

    /**
     * Uploads a screenshot of the current page to S3, then waits (up to 30 s)
     * until the number of elements matching `elementSelector` differs from
     * `elementCount`.
     *
     * @param {object} page - puppeteer page.
     * @param {string} elementSelector - CSS selector to count in the page.
     * @param {number} elementCount - Count observed before the wait started.
     * @throws Whatever `page.waitForFunction` rejects with (e.g. on timeout),
     *         as well as screenshot/upload failures.
     */
    async loadElements(page, elementSelector, elementCount) {
        console.log('count: ', elementCount);

        // Upload a screenshot named after the current timestamp and the count
        // being waited on.
        const buffer = await page.screenshot();
        const s3result = await s3
            .upload({
                Bucket: 'mybucket',
                Key: `${Date.now()}-elementCount:${elementCount}.png`,
                Body: buffer,
                ContentType: 'image/png',
                ACL: 'public-read',
            })
            .promise();

        console.log('S3 image URL:', s3result.Location);

        // The predicate runs inside the page, so it may only use its arguments.
        await page.waitForFunction(
            (selector, previousCount) =>
                document.querySelectorAll(selector).length !== previousCount,
            { timeout: 30000 },
            elementSelector,
            elementCount
        );
    }

    /**
     * Scrolls repeatedly, waiting after each scroll for the element count to
     * change. The loop ends when a step rejects (in practice when the wait in
     * `loadElements` rejects because no new elements appeared); that error is
     * logged and not rethrown.
     *
     * @param {object} page - puppeteer page.
     * @param {string} elementSelector - CSS selector of the elements being loaded.
     */
    async loadAllElements(page, elementSelector) {
        try {
            while (true) {
                const countBeforeScroll = await page.evaluate(
                    this.scrollToBottom,
                    this.scrollableSelector,
                    elementSelector
                );
                await this.loadElements(page, elementSelector, countBeforeScroll);
            }
        } catch (error) {
            console.error(error);
        }
    }

    /**
     * Waits for the popup button to exist and clicks it.
     *
     * @param {object} page - puppeteer page.
     * @throws If the button does not appear before the wait times out.
     */
    async closePopUp(page) {
        await this.loadElements(page, this.popupSelector, 0);
        await page.evaluate((sel) => document.querySelector(sel).click(), this.popupSelector);
        console.log('popup closed');
    }

    /**
     * Executed inside the page through `page.evaluate`, so it must not rely on
     * `this` or on anything from the Node scope. Counts the matching elements,
     * then smooth-scrolls the scrollable element into view.
     *
     * @param {string} scrollableSelector - CSS selector of the element to scroll to.
     * @param {string} elementSelector - CSS selector of the elements to count.
     * @returns {number} Number of matching elements before the scroll.
     */
    scrollToBottom(scrollableSelector, elementSelector) {
        const elementCount = document.querySelectorAll(elementSelector).length;
        document.querySelector(scrollableSelector).scrollIntoView({ behavior: "smooth", block: "end" });
        console.log('SCROLLED TO BOTTOM');
        return elementCount;
    }
}

module.exports = Scraper;
const Scraper = require('./scraper.js')

/**
 * Scraper configuration and parsing rules for the bwin football listing.
 */
class Bwin extends Scraper {
    // CSS selector matching one event row.
    eventSelector = 'ms-event'
    // CSS selector of the consent button clicked before scraping.
    popupSelector = 'button#onetrust-accept-btn-handler'
    // CSS selector of the element scrolled into view to trigger loading.
    scrollableSelector = 'div.grid-footer'

    /**
     * Extracts the raw text fields of every event element. This function is
     * handed to `page.$$eval` by the base class, so it executes inside the
     * page and must not reference `this` or anything from the Node scope.
     *
     * @param {Element[]} events - Elements matching `eventSelector`.
     * @returns {Promise<Array<{home: string, away: string, date: string, competition: string}>>}
     */
    async getEventInfo(events) {
        return events.map((event) => ({
            home: event.querySelector('div.participant-wrapper:nth-child(1)').textContent.trim(),
            away: event.querySelector('div.participant-wrapper:nth-child(2)').textContent.trim(),
            date: event.querySelector('ms-event-timer').textContent.trim(),
            competition: event.closest('ms-event-group').querySelector('ms-league-header').textContent.trim(),
        }));
    }

    /**
     * Converts one raw event (as produced by `getEventInfo`) into its parsed form.
     *
     * @param {{home: string, away: string, date: string, competition: string}} event
     * @returns {{home: string, away: string, date: (Date|undefined), country: string, competition: (string|undefined)}}
     */
    parseEventInfo(event) {
        const { home, away } = event;
        const [country, competition] = this.parseCompetition(event.competition);
        const date = this.parseDate(event.date);
        return { home, away, date, country, competition };
    }

    /**
     * Splits a "<country> | <competition>" header into its two trimmed parts.
     *
     * @param {string} competition - Header text.
     * @returns {Array<string|undefined>} `[country, competition]`; the second
     *          entry is undefined when the text contains no '|'.
     */
    parseCompetition(competition) {
        const [country, competitionName] = competition.split('|').map((part) => part.trim());
        return [country, competitionName];
    }

    /**
     * Parses the event timer text into a Date in the local time zone.
     *
     * Two shapes are handled:
     *   - text containing 'Hoje' (today) or 'Amanhã' (tomorrow), with the
     *     time after a '/', e.g. "Hoje / 19:30"
     *   - "DD/MM/YYYY HH:mm" or "DD/MM/YY HH:mm"
     *
     * @param {string} dateStr - Timer text.
     * @returns {Date|undefined} The parsed date, or undefined when parsing
     *          throws (the error is logged).
     */
    parseDate(dateStr) {
        try {
            if (dateStr.includes('Hoje') || dateStr.includes('Amanhã')) {
                const [hour, minute] = dateStr.split('/')[1].trim().split(':').map(Number);
                const date = new Date();
                date.setHours(hour, minute, 0, 0);
                if (dateStr.includes('Amanhã')) date.setDate(date.getDate() + 1);
                return date;
            }

            // Split on any run of whitespace so repeated spaces between the
            // date and the time do not produce an empty time part.
            const [datePart, timePart] = dateStr.trim().split(/\s+/);
            const [day, month, rawYear] = datePart.split('/').map(Number);
            const [hour, minute] = timePart.split(':').map(Number);

            // `new Date(year, ...)` maps years 0-99 to 1900-1999, so a
            // two-digit year has to be expanded explicitly.
            const year = rawYear < 100 ? 2000 + rawYear : rawYear;

            // The Date constructor expects a zero-based month.
            return new Date(year, month - 1, day, hour, minute, 0, 0);
        } catch (error) {
            console.log(error);
            return undefined;
        }
    }
}

module.exports = Bwin;

Has anyone experienced this?

@joaofaria97 joaofaria97 added the bug Something isn't working label Dec 4, 2023
@joaofaria97
Copy link
Author

I also tried making the viewports equal, but it still doesn't scroll down.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
bug Something isn't working
Projects
None yet
Development

No branches or pull requests

1 participant