Scraping by Category and Page Number in Python

In this tutorial you will learn how to scrape specific data from a website by category and page number using the Python programming language.

Scraping Data by Category and Page Number

Program

from lxml import html
import requests
import re
import mysql.connector
from html import escape

# Open the single MySQL connection shared by the whole script and create the
# cursor that executeProcess() uses to insert the scraped rows.
cnx = mysql.connector.connect(
    host='127.0.0.1',
    database='python_db',
    user='root',
    password='',
)
cursor = cnx.cursor()

def index_exists(ls, i):
    """Return True when ``ls[i]`` is a valid index into ``ls``.

    Both non-negative indices and negative (counted from the end) indices
    are accepted; an empty sequence has no valid index.
    """
    size = len(ls)
    return -size <= i < size

def _valueOrNA(values, index):
    """Return ``values[index]`` as a plain ``str``, or ``"NA"`` if it is missing."""
    if index_exists(values, index):
        # lxml returns str subclasses from xpath(); convert to a plain str so
        # the value can be bound as a query parameter by the MySQL connector.
        return str(values[index])
    return "NA"


def executeProcess(category, pageNumber, maxPage=10):
    """Scrape the product listing pages of one category into the ``data`` table.

    Starting at ``pageNumber``, each page is downloaded, the product name,
    price, SKU, image URL and product URL are extracted with XPath, and one
    row per product name is inserted through the module-level ``cursor``.
    Fields that have no value at a product's position are stored as ``"NA"``.
    The inserts of every page are committed on ``cnx`` before the next page
    is requested.

    Args:
        category: Category slug appended to the site URL.
        pageNumber: First page to scrape. This page is always fetched.
        maxPage: Scraping stops as soon as the next page number reaches this
            value, so the last page fetched is ``maxPage - 1`` (unless the
            starting page is already at or beyond it).

    Raises:
        requests.exceptions.RequestException: If a page cannot be downloaded
            (including when the 30 second timeout expires).
    """
    # Common XPath prefixes shared by the field selectors below.
    itemPath = '//li[@class="item product product-item"]'
    infoPath = itemPath + '//div[@class="product details product-item-details box-info"]'

    # Placeholders let the connector quote the values, so quotes or SQL
    # fragments in scraped text cannot break or alter the statement.
    insertSql = ("INSERT INTO data(name, price, sku, image, url) "
                 "VALUES (%s, %s, %s, %s, %s)")

    # Iterate instead of recursing once per page; the first page is always
    # processed, matching the original control flow.
    while True:
        pageUrl = 'https://www.example-website.com/' + str(category) + '?p=' + str(pageNumber)
        # A timeout keeps a stalled server from hanging the script forever.
        page = requests.get(pageUrl, timeout=30)
        print('Scraping URL: ' + pageUrl)
        tree = html.fromstring(page.content)

        productName = tree.xpath(infoPath + '//a[@class="product-item-link"]/text()')
        productPrice = tree.xpath(infoPath + '//span[@class="price"]/text()')
        productSku = tree.xpath(infoPath + '//span[@class="skulist  col-sm-4"]/text()')
        productImage = tree.xpath(itemPath + '//div[@class="box-image"]//span[@class="product-image-container"]//img[@class="product-image-photo lazyload"]//@data-src')
        productUrl = tree.xpath(infoPath + '//h2[@class="product name product-item-name product-name"]//a[@class="product-item-link"]//@href')

        # One row per product name; the other lists are padded with "NA".
        # name/price/sku stay HTML-escaped so stored values keep the same
        # format the original script produced.
        rows = [
            (
                escape(_valueOrNA(productName, indexNumber)),
                escape(_valueOrNA(productPrice, indexNumber)),
                escape(_valueOrNA(productSku, indexNumber)),
                _valueOrNA(productImage, indexNumber),
                _valueOrNA(productUrl, indexNumber),
            )
            for indexNumber in range(len(productName))
        ]
        if rows:
            cursor.executemany(insertSql, rows)
            # Autocommit is off by default; without this the rows are lost
            # when the connection closes.
            cnx.commit()

        pageNumber = pageNumber + 1
        if pageNumber >= maxPage:
            break

    print("Category: "+str(category)+" Page Number: "+str(pageNumber)+" Completed.")

# Run the scraper for one category, then release the database resources
# whether or not scraping succeeded.
try:
    executeProcess("hardware-tools", 1)
    # Make sure every pending insert is persisted before reporting success;
    # the connector does not autocommit by default.
    cnx.commit()
    print("All data is saved in MySQL database")
finally:
    cursor.close()
    cnx.close()
Output
Category: hardware-tools Page Number: 10 Completed.
All data is saved in MySQL database

Note: The above tutorial was created for educational and learning purposes.