In this tutorial, you will learn how to scrape specific data from a website by category and page number using the Python programming language.
from lxml import html
import requests
import re
import mysql.connector
from html import escape
# Connection settings for the MySQL server that receives the scraped rows.
_DB_SETTINGS = {
    'user': 'root',
    'password': '',
    'host': '127.0.0.1',
    'database': 'python_db',
}

# A single connection and cursor are opened when the module loads and are
# shared by the rest of the script.
cnx = mysql.connector.connect(**_DB_SETTINGS)
cursor = cnx.cursor()
def index_exists(ls, i):
    """Return True when ``ls[i]`` would not raise ``IndexError``.

    Both non-negative indices (``0 .. len(ls) - 1``) and negative indices
    (``-len(ls) .. -1``) count as existing. An empty sequence has no valid
    index.

    Args:
        ls: Any sized sequence.
        i: The integer index to check.

    Returns:
        bool: Whether ``i`` addresses an element of ``ls``.
    """
    # One chained comparison covers the positive and negative ranges at once.
    return -len(ls) <= i < len(ls)
def _first_or_na(values):
    """Return the first XPath result as a plain ``str``, or "NA" if empty."""
    # lxml returns its own str subclass; convert to a plain str so the
    # database driver can bind the value as a query parameter.
    return str(values[0]) if values else "NA"


def executeProcess(category, pageNumber, pageLimit=10):
    """Scrape the product listing pages of *category* and store them in MySQL.

    Starting at ``pageNumber``, each listing page is downloaded, every
    product ``<li>`` on it is turned into one ``(name, price, sku, image,
    url)`` row, and the rows are inserted into the ``data`` table through
    the module-level ``cursor``. The starting page is always processed;
    further pages are processed while the next page number is below
    ``pageLimit``.

    Args:
        category: Category path segment appended to the site URL.
        pageNumber: First page number to scrape (value of the ``p`` query
            parameter).
        pageLimit: Exclusive upper bound for page numbers. Defaults to 10,
            the limit that used to be hard-coded.

    Raises:
        requests.RequestException: If a page cannot be downloaded
            (including hitting the request timeout).
    """
    # Fields are looked up relative to each product <li>, so a product that
    # lacks e.g. a SKU gets "NA" instead of shifting every later value onto
    # the wrong product.
    item_xpath = '//li[@class="item product product-item"]'
    details = './/div[@class="product details product-item-details box-info"]'
    name_xpath = details + '//a[@class="product-item-link"]/text()'
    price_xpath = details + '//span[@class="price"]/text()'
    sku_xpath = details + '//span[@class="skulist col-sm-4"]/text()'
    image_xpath = (
        './/div[@class="box-image"]'
        '//span[@class="product-image-container"]'
        '//img[@class="product-image-photo lazyload"]//@data-src'
    )
    url_xpath = (
        details
        + '//h2[@class="product name product-item-name product-name"]'
        + '//a[@class="product-item-link"]//@href'
    )
    # Placeholders let the driver quote every value, including image and url.
    insert_sql = (
        "INSERT INTO data(name, price, sku, image, url) "
        "VALUES (%s, %s, %s, %s, %s)"
    )

    # A loop replaces the former self-recursion; the page sequence is the same.
    while True:
        page_url = ('https://www.example-website.com/' + str(category)
                    + '?p=' + str(pageNumber))
        print('Scraping URL: ' + page_url)
        # Without a timeout a stalled server would hang the script forever.
        page = requests.get(page_url, timeout=30)

        if page.ok and page.content.strip():
            tree = html.fromstring(page.content)
            rows = []
            for item in tree.xpath(item_xpath):
                # html.escape is kept on the text fields so the stored
                # values have the same form as before; SQL safety now comes
                # from the parameter binding, not from this escaping.
                rows.append((
                    escape(_first_or_na(item.xpath(name_xpath))),
                    escape(_first_or_na(item.xpath(price_xpath))),
                    escape(_first_or_na(item.xpath(sku_xpath))),
                    _first_or_na(item.xpath(image_xpath)),
                    _first_or_na(item.xpath(url_xpath)),
                ))
            if rows:
                cursor.executemany(insert_sql, rows)
                # Commit per page so finished pages survive a later failure.
                cnx.commit()
        else:
            # An error page or empty body has no products to parse.
            print('Skipping page, HTTP status: ' + str(page.status_code))

        pageNumber = pageNumber + 1
        if pageNumber >= pageLimit:
            break

    print("Category: "+str(category)+" Page Number: "+str(pageNumber)+" Completed.")
# Run the scrape for one category, starting from the first page.
try:
    executeProcess("hardware-tools", 1)
    # The inserts are only persisted once the transaction is committed.
    cnx.commit()
    print("All data is saved in MySQL database")
finally:
    # Release the database resources even when the scrape fails part-way.
    cursor.close()
    cnx.close()
Note: The above tutorial was created for educational and learning purposes.