In the following tutorials you will learn how to log in to a website to create a session, navigate to a page, scrape specific data, and send it to a server in JSON format using the Python programming language.
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import re
import json
from bs4 import BeautifulSoup
import requests
# --- Configuration ---------------------------------------------------------
# Placeholder credentials and endpoints used by this tutorial script.
print("Initializing credentials")
USERNAME = "email@example.com"
PASSWORD = "Admin123"
LOGIN_URL = "https://example-website.com/home?v=login"
TableURL = "https://example-website.com/views/343df34f"
SEND_DATA_URL = 'http://localhost/scraping/data'

# --- Browser setup ---------------------------------------------------------
print("Initializing drivers")
# Configure a single options object and hand it to the driver; options that
# are built but never passed to webdriver.Chrome() have no effect.
options = Options()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

# --- Login -----------------------------------------------------------------
print("Loging in")
try:
    driver.get(LOGIN_URL)
    # Wait for each control before touching it: the form is driven in two
    # steps (e-mail, then password), so the fields may not be ready right
    # after the page load or the first click.
    _login_wait = WebDriverWait(driver, 30)
    _login_wait.until(
        EC.visibility_of_element_located((By.ID, "loginEmail"))
    ).send_keys(USERNAME)
    _login_wait.until(
        EC.element_to_be_clickable((By.ID, "formControl"))
    ).click()
    _login_wait.until(
        EC.visibility_of_element_located((By.ID, "loginPassword"))
    ).send_keys(PASSWORD)
    _login_wait.until(
        EC.element_to_be_clickable((By.ID, "formControl"))
    ).click()
except Exception:
    # Do not leave a browser process behind when the login cannot complete.
    driver.quit()
    raise
# Output key -> index of the <td> cell it is read from. The tuple order is
# the order in which the keys are emitted in each JSON object.
_TABLE_COLUMNS = (
    ("order", 1),
    ("status", 2),
    ("vin", 3),
    ("pre_sold_customer", 4),
    ("load", 5),
    ("description", 6),
    ("add_ons", 7),
    ("standard_features", 8),
    ("color", 9),
    ("ship_to", 10),
)
# A row needs at least this many <td> cells for every index above to exist.
_MIN_TABLE_CELLS = max(cell_index for _, cell_index in _TABLE_COLUMNS) + 1


def _clean_cell(text, newline_to_space=False):
    """Strip *text* and remove embedded tabs and newlines.

    When ``newline_to_space`` is true, tabs are removed but every newline is
    replaced by a single space instead of being dropped.
    """
    text = text.strip()
    if newline_to_space:
        return re.sub(r"\n", " ", re.sub(r"\t+", "", text))
    return re.sub(r"[\n\t]+", "", text)


def startProcess():
    """Scrape the second ``<table>`` on ``TableURL`` and pass it to ``sendData``.

    Loads ``TableURL`` in the module-level ``driver``, waits up to 30 seconds
    for the first table to become visible, parses the page source with
    BeautifulSoup and converts each data row of the second table into a dict
    keyed as in ``_TABLE_COLUMNS``. The list of dicts is JSON-encoded and
    handed to ``sendData``.

    Rows with fewer than ``_MIN_TABLE_CELLS`` ``<td>`` cells (for example
    header rows built from ``<th>``) are skipped. If the page has fewer than
    two tables, an empty list is sent.

    A ``TimeoutException`` from the wait is reported on stdout together with
    the current page source. The driver is always shut down on exit, whether
    the scrape succeeded or failed.
    """
    try:
        print("Waiting for data table to load")
        driver.get(TableURL)
        WebDriverWait(driver, 30).until(
            EC.visibility_of_element_located((By.XPATH, '(//table)[1]'))
        )
        print("Getting table source")
        soup = BeautifulSoup(driver.page_source, "lxml")
        print("Getting all rows from table")
        tables = soup.find_all('table')
        # The data lives in the second table of the page.
        rows = tables[1].find_all('tr') if len(tables) > 1 else []
        result = []
        for row in rows:
            columns = row.find_all('td')
            if len(columns) < _MIN_TABLE_CELLS:
                continue
            result.append({
                key: _clean_cell(
                    columns[cell_index].get_text(),
                    newline_to_space=(key == "status"),
                )
                for key, cell_index in _TABLE_COLUMNS
            })
        # ensure_ascii must be the boolean False; the string 'False' is truthy
        # and would leave non-ASCII characters escaped.
        sendData(json.dumps(result, ensure_ascii=False))
    except TimeoutException:
        print("Exception occurred: Desired url was not rendered with in allocated time")
        print(driver.page_source)
    finally:
        driver.quit()
def sendData(json):
    """Dry-run sender: print the payload between the status banners.

    Nothing is transmitted in this variant; the payload is only echoed to
    stdout so the scraped data can be inspected.

    Args:
        json: The JSON-encoded payload to display. (The name shadows the
            ``json`` module inside this function.)
    """
    output_lines = (
        "Sending data to server, and waiting for response. Please wait it can take a while",
        json,
        "-------Server Response-------",
        "-------Server Response Finished-------",
    )
    for line in output_lines:
        print(line)
# Script entry point: run the table scrape defined by startProcess().
startProcess()
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.service import Service
from webdriver_manager.chrome import ChromeDriverManager
import re
import json
from bs4 import BeautifulSoup
import requests
# --- Configuration ---------------------------------------------------------
# Placeholder credentials and endpoints used by this tutorial script.
print("Initializing credentials")
USERNAME = "email@example.com"
PASSWORD = "Admin123"
LOGIN_URL = "https://example-website.com/home?v=login"
TableURL = "https://example-website.com/views/fd34df34T"
SEND_DATA_URL = 'http://localhost/scraping/data'

# --- Browser setup ---------------------------------------------------------
print("Initializing drivers")
# Configure a single options object and hand it to the driver; options that
# are built but never passed to webdriver.Chrome() have no effect.
options = Options()
options.add_experimental_option('excludeSwitches', ['enable-logging'])
options.add_argument("--headless")
driver = webdriver.Chrome(options=options)

# --- Login -----------------------------------------------------------------
print("Loging in")
try:
    driver.get(LOGIN_URL)
    # Wait for each control before touching it: the form is driven in two
    # steps (e-mail, then password), so the fields may not be ready right
    # after the page load or the first click.
    _login_wait = WebDriverWait(driver, 30)
    _login_wait.until(
        EC.visibility_of_element_located((By.ID, "loginEmail"))
    ).send_keys(USERNAME)
    _login_wait.until(
        EC.element_to_be_clickable((By.ID, "formControl"))
    ).click()
    _login_wait.until(
        EC.visibility_of_element_located((By.ID, "loginPassword"))
    ).send_keys(PASSWORD)
    _login_wait.until(
        EC.element_to_be_clickable((By.ID, "formControl"))
    ).click()
except Exception:
    # Do not leave a browser process behind when the login cannot complete.
    driver.quit()
    raise
# Output key -> value of the cell's ``aria-colindex`` attribute. The tuple
# order is the order in which the keys are emitted in each JSON object.
_GRID_COLUMNS = (
    ("order", "2"),
    ("status", "8"),
    ("vin", "9"),
    ("pre_sold_customer", "5"),
    ("load", "3"),
    ("description", "10"),
    ("add_ons", "12"),
    ("standard_features", "11"),
    ("color", "7"),
    ("ship_to", "6"),
    ("col_dealer", "4"),
    ("col_estimated_finish_date", "13"),
    ("col_gvwr", "14"),
    ("col_invoice", "15"),
    ("col_trailer_category", "16"),
    ("col_dealer_portal_access", "17"),
)


def _clean_cell(text, newline_to_space=False):
    """Strip *text* and remove embedded tabs and newlines.

    When ``newline_to_space`` is true, tabs are removed but every newline is
    replaced by a single space instead of being dropped.
    """
    text = text.strip()
    if newline_to_space:
        return re.sub(r"\n", " ", re.sub(r"\t+", "", text))
    return re.sub(r"[\n\t]+", "", text)


def _grid_cell_text(row, col_index):
    """Return the text of the cell with ``aria-colindex == col_index``, or ""."""
    cell = row.find("div", attrs={"aria-colindex": col_index})
    return cell.text if cell is not None else ""


def startProcess():
    """Scrape the div-based grid on ``TableURL`` and pass it to ``sendData``.

    Loads ``TableURL`` in the module-level ``driver``, waits up to 30 seconds
    for a ``div`` whose class contains ``grid-view-all`` to become visible,
    then parses the page source with BeautifulSoup. Every ``div.data-row``
    becomes a dict keyed as in ``_GRID_COLUMNS``; a cell that is absent from
    a row yields an empty string. The list of dicts is JSON-encoded and
    handed to ``sendData``.

    A ``TimeoutException`` from the wait is reported on stdout together with
    the current page source. The driver is always shut down on exit, whether
    the scrape succeeded or failed.
    """
    try:
        print("Waiting for data table to load")
        driver.get(TableURL)
        # The attribute test must be written @class; "class=" inside
        # contains() is not valid XPath.
        WebDriverWait(driver, 30).until(
            EC.visibility_of_element_located(
                (By.XPATH, "(//div[contains(@class, 'grid-view-all')])")
            )
        )
        print("Getting table source")
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        print("Getting all rows from table")
        result = [
            {
                key: _clean_cell(
                    _grid_cell_text(row, col_index),
                    newline_to_space=(key == "status"),
                )
                for key, col_index in _GRID_COLUMNS
            }
            for row in soup.find_all("div", class_="data-row")
        ]
        # ensure_ascii must be the boolean False; the string 'False' is truthy
        # and would leave non-ASCII characters escaped.
        sendData(json.dumps(result, ensure_ascii=False))
    except TimeoutException:
        print("Exception occurred: Desired url was not rendered with in allocated time")
        print(driver.page_source)
    finally:
        driver.quit()
def sendData(json, timeout=30):
    """POST the scraped payload to ``SEND_DATA_URL`` and print the reply.

    The payload is echoed to stdout, then sent form-encoded under the field
    name ``json``. The response body is printed between two banner lines.

    Args:
        json: The JSON-encoded payload string. (The name shadows the ``json``
            module inside this function; it is kept for caller compatibility.)
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` may block indefinitely on an unresponsive
            server.

    Raises:
        requests.exceptions.RequestException: If the request fails or times
            out.
    """
    print(json)
    print("Sending data to server, and waiting for response. Please wait it can take a while")
    response = requests.post(SEND_DATA_URL, data={'json': json}, timeout=timeout)
    print("-------Server Response-------")
    print(response.text)
    print("-------Server Response Finished-------")
# Script entry point: run the grid scrape defined by startProcess().
startProcess()
Note: The tutorials above are intended for educational and learning purposes.