Log In to Scrape Data and Send It to a Server in Python

In this tutorial you will learn how to log in to a website to create a session for navigation, scrape specific data, and send it to a server in JSON format using the Python programming language.

Log In to Scrape Data and Send It to a Server

Program

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import re
import json
from bs4 import BeautifulSoup
import requests

print("Initializing credentials")
# NOTE: credentials are hard-coded only to keep the tutorial self-contained;
# load them from the environment or a secret store in real use.
USERNAME = "email@example.com"
PASSWORD = "Admin123"

LOGIN_URL = "https://example-website.com/home?v=login"
TableURL = "https://example-website.com/views/343df34f"
SEND_DATA_URL = 'http://localhost/scraping/data'

print("Initializing drivers")
# Build ONE options object and actually hand it to the driver. Previously the
# first options object was overwritten by a second one and neither was passed
# to webdriver.Chrome(), so the headless/logging settings had no effect.
options = Options()
# Suppress the noisy "DevTools listening ..." console logging from chromedriver.
options.add_experimental_option('excludeSwitches', ['enable-logging'])
# Run Chrome without a visible window.
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)
# Alternative: let webdriver_manager download a matching chromedriver binary.
#driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
print("Loging in")
driver.get(LOGIN_URL)
# Two-step login form: submit the e-mail first, then the password, using the
# same "formControl" button for both steps.
driver.find_element(By.ID, "loginEmail").send_keys(USERNAME)
driver.find_element(By.ID, "formControl").click()
driver.find_element(By.ID, "loginPassword").send_keys(PASSWORD)
driver.find_element(By.ID, "formControl").click()

# Output field name -> zero-based <td> position inside a data row.
_TABLE_COLUMNS = {
    'order': 1,
    'status': 2,
    'vin': 3,
    'pre_sold_customer': 4,
    'load': 5,
    'description': 6,
    'add_ons': 7,
    'standard_features': 8,
    'color': 9,
    'ship_to': 10,
}
# A row must have at least this many <td> cells to be a data row.
_MIN_CELLS = max(_TABLE_COLUMNS.values()) + 1


def _clean_text(text, newline_replacement=""):
    """Strip *text*, drop tabs, and swap each newline for *newline_replacement*."""
    return text.strip().replace("\t", "").replace("\n", newline_replacement)


def startProcess():
    """Load the table page, scrape the second <table>, and send the rows as JSON.

    Each data row becomes a dict keyed by the names in ``_TABLE_COLUMNS``. The
    list of dicts is serialised with ``json.dumps`` and passed to ``sendData``.
    The browser is always closed, whether scraping succeeds, times out, or
    fails with any other exception.
    """
    try:
        print("Waiting for data table to load")
        driver.get(TableURL)
        # Block (up to 30 s) until the first table is rendered and visible.
        WebDriverWait(driver, 30).until(
            EC.visibility_of_element_located((By.XPATH, '(//table)[1]')))

        print("Getting table source")
        soup = BeautifulSoup(driver.page_source, "lxml")
        print("Getting all rows from table")
        tables = soup.find_all('table')
        # The data lives in the second table of the page (document order).
        rows = tables[1].find_all('tr') if len(tables) > 1 else []

        result = []
        for row in rows:
            columns = row.find_all('td')
            # Header rows (<th> only) and spacer rows have too few <td> cells;
            # indexing them would raise IndexError, so skip them.
            if len(columns) < _MIN_CELLS:
                continue
            result.append({
                # 'status' keeps its line breaks as spaces; every other field
                # has newlines removed outright.
                field: _clean_text(columns[position].get_text(),
                                   " " if field == 'status' else "")
                for field, position in _TABLE_COLUMNS.items()
            })

        # ensure_ascii must be the boolean False (the string 'False' is truthy)
        # so that non-ASCII characters are sent as-is rather than \u-escaped.
        sendData(json.dumps(result, ensure_ascii=False))
    except TimeoutException:
        print("Exception occurred: Desired url was not rendered with in allocated time")
        print(driver.page_source)
    finally:
        # Always release the browser, even on unexpected errors.
        driver.quit()

def sendData(json):
    """POST the scraped rows to ``SEND_DATA_URL`` and print the server's reply.

    Args:
        json: JSON-encoded string of the scraped rows. The name shadows the
            ``json`` module inside this function; it is kept so existing
            callers keep working.

    Raises:
        requests.exceptions.RequestException: if the server cannot be reached
            or does not answer within the timeout.
    """
    print("Sending data to server, and waiting for response. Please wait it can take a while")
    print(json)
    # The payload is sent as a form field named "json". The timeout is
    # (connect, read) seconds: the read side is generous because the server
    # may take a while, but the call can no longer hang forever.
    response = requests.post(SEND_DATA_URL, data={'json': json}, timeout=(10, 300))
    print("-------Server Response-------")
    print(response.text)
    print("-------Server Response Finished-------")

# Entry point: run the scrape-and-send workflow once when the script executes.
startProcess()
Output
Initializing credentials
Initializing drivers
Loging in
Waiting for data table to load
Getting table source
Getting all rows from table
Sending data to server, and waiting for response. Please wait it can take a while
[{"order": "....", "status": "....", "vin": "....", "pre_sold_customer": "....", "load": "", "description": "....", "add_ons": "....", "standard_features": "....", "color": "....", "ship_to": "...."}]
-------Server Response-------
All data processed and saved in database successfully
-------Server Response Finished-------

Log In to a Website to Scrape Data and Send It to a Server: Another Example

Program

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import re
import json
from bs4 import BeautifulSoup
import requests

print("Initializing credentials")
# NOTE: credentials are hard-coded only to keep the tutorial self-contained;
# load them from the environment or a secret store in real use.
USERNAME = "email@example.com"
PASSWORD = "Admin123"

LOGIN_URL = "https://example-website.com/home?v=login"
TableURL = "https://example-website.com/views/fd34df34T"
SEND_DATA_URL = 'http://localhost/scraping/data'

print("Initializing drivers")
# Build ONE options object and actually hand it to the driver. Previously the
# first options object was overwritten by a second one and neither was passed
# to webdriver.Chrome(), so the headless/logging settings had no effect.
options = Options()
# Suppress the noisy "DevTools listening ..." console logging from chromedriver.
options.add_experimental_option('excludeSwitches', ['enable-logging'])
# Run Chrome without a visible window.
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)
# Alternative: let webdriver_manager download a matching chromedriver binary.
#driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)
print("Loging in")
driver.get(LOGIN_URL)
# Two-step login form: submit the e-mail first, then the password, using the
# same "formControl" button for both steps.
driver.find_element(By.ID, "loginEmail").send_keys(USERNAME)
driver.find_element(By.ID, "formControl").click()
driver.find_element(By.ID, "loginPassword").send_keys(PASSWORD)
driver.find_element(By.ID, "formControl").click()

# Output field name -> value of the cell's "aria-colindex" attribute.
# Key order is the order in which fields appear in the JSON sent to the server.
_GRID_COLUMNS = {
    'order': "2",
    'status': "8",
    'vin': "9",
    'pre_sold_customer': "5",
    'load': "3",
    'description': "10",
    'add_ons': "12",
    'standard_features': "11",
    'color': "7",
    'ship_to': "6",
    'col_dealer': "4",
    'col_estimated_finish_date': "13",
    'col_gvwr': "14",
    'col_invoice': "15",
    'col_trailer_category': "16",
    'col_dealer_portal_access': "17",
}


def _clean_text(text, newline_replacement=""):
    """Strip *text*, drop tabs, and swap each newline for *newline_replacement*."""
    return text.strip().replace("\t", "").replace("\n", newline_replacement)


def _cell_text(row, colindex):
    """Return the text of the cell with the given aria-colindex, or "" if absent."""
    cell = row.find("div", attrs={"aria-colindex": colindex})
    return cell.text if cell is not None else ""


def startProcess():
    """Load the grid page, scrape every ``div.data-row``, and send the rows as JSON.

    Each row becomes a dict keyed by the names in ``_GRID_COLUMNS``; a cell
    that is missing from a row yields an empty string instead of aborting the
    whole scrape. The list of dicts is serialised with ``json.dumps`` and
    passed to ``sendData``. The browser is always closed, whether scraping
    succeeds, times out, or fails with any other exception.
    """
    try:
        print("Waiting for data table to load")
        driver.get(TableURL)
        # Block (up to 30 s) until the grid container is rendered and visible.
        # The attribute test must be written @class; the previous literal
        # evaluated to "contains(class=, ...)", which is not valid XPath.
        WebDriverWait(driver, 30).until(
            EC.visibility_of_element_located(
                (By.XPATH, "//div[contains(@class, 'grid-view-all')]")))
        print("Getting table source")
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        print("Getting all rows from table")
        result = [
            {
                # 'status' keeps its line breaks as spaces; every other field
                # has newlines removed outright.
                field: _clean_text(_cell_text(div, colindex),
                                   " " if field == 'status' else "")
                for field, colindex in _GRID_COLUMNS.items()
            }
            for div in soup.find_all("div", class_="data-row")
        ]
        # ensure_ascii must be the boolean False (the string 'False' is truthy)
        # so that non-ASCII characters are sent as-is rather than \u-escaped.
        sendData(json.dumps(result, ensure_ascii=False))
    except TimeoutException:
        print("Exception occurred: Desired url was not rendered with in allocated time")
        print(driver.page_source)
    finally:
        # Always release the browser, even on unexpected errors.
        driver.quit()

def sendData(json):
    """POST the scraped rows to ``SEND_DATA_URL`` and print the server's reply.

    Args:
        json: JSON-encoded string of the scraped rows. The name shadows the
            ``json`` module inside this function; it is kept so existing
            callers keep working.

    Raises:
        requests.exceptions.RequestException: if the server cannot be reached
            or does not answer within the timeout.
    """
    # Announce first, then echo the payload, matching the documented output.
    print("Sending data to server, and waiting for response. Please wait it can take a while")
    print(json)
    # The payload is sent as a form field named "json". The timeout is
    # (connect, read) seconds: the read side is generous because the server
    # may take a while, but the call can no longer hang forever.
    response = requests.post(SEND_DATA_URL, data={'json': json}, timeout=(10, 300))
    print("-------Server Response-------")
    print(response.text)
    print("-------Server Response Finished-------")

# Entry point: run the scrape-and-send workflow once when the script executes.
startProcess()
Output
Initializing credentials
Initializing drivers
Loging in
Waiting for data table to load
Getting table source
Getting all rows from table
Sending data to server, and waiting for response. Please wait it can take a while
[{"order": "....", "status": "....", "vin": "....", "pre_sold_customer": "....", "load": "....", "description": "....", "add_ons": "....", "standard_features": "....", "color": "....", "ship_to": "....", "col_dealer": "....", "col_estimated_finish_date": "....", "col_gvwr": "....", "col_invoice": "....", "col_trailer_category": "....", "col_dealer_portal_access": "...."}]
-------Server Response-------
All data processed and saved in database successfully
-------Server Response Finished-------

Note: The above tutorials are created for educational and learning purposes.