Nashville Housing Data Cleanup and Transformation
- Ekramul Haque Towsif
- Sep 15, 2024
- 2 min read
This SQL script contains queries that are used for cleaning and transforming the Nashville housing dataset. This includes tasks for getting rid of duplicates, adding, dropping new columns, and carrying out data transformations to prepare the dataset for the analytical stage.
Link to Code:
Let's have a look at the data first:
Next, we will split the PropertyAddress and OwnerAddress fields into separate columns for Address, City, and State.
SELECT PropertyAddress,
CASE
WHEN LOCATE(',', PropertyAddress) > 0
THEN SUBSTRING(PropertyAddress, 1, LOCATE(', ', PropertyAddress) - 1)
ELSE PropertyAddress
END AS Address,
CASE
WHEN LOCATE(',', PropertyAddress) > 0
THEN SUBSTRING(PropertyAddress, LOCATE(', ', PropertyAddress) + 1, length(PropertyAddress))
ELSE PropertyAddress
END AS Address
FROM nashville_housing;alter table nashville_housing
add PropertySplitAddress nvarchar(255);
update nashville_housing
set PropertySplitAddress = SUBSTRING(PropertyAddress, 1, LOCATE(', ', PropertyAddress) - 1);
alter table nashville_housing
add PropertySplitCity nvarchar(255);
update nashville_housing
set PropertySplitCity = SUBSTRING(PropertyAddress, LOCATE(', ', PropertyAddress) + 1, length(PropertyAddress));output:
Next, we converted values in the SoldAsVacant field from 'Y' and 'N' to 'Yes' and 'No'. And we will also update the table:
select SoldAsVacant,
case
when SoldAsVacant = 'Y' then 'Yes'
when SoldAsVacant = 'N' then 'No'
else SoldAsVacant
end
from nashville_housing;update nashville_housing
set SoldAsVacant = case
when SoldAsVacant = 'Y' then 'Yes'
when SoldAsVacant = 'N' then 'No'
else SoldAsVacant
end;
set SoldAsVacant = case
when SoldAsVacant = 'Y' then 'Yes'
when SoldAsVacant = 'N' then 'No'
else SoldAsVacant
end;
set SoldAsVacant = case
when SoldAsVacant = 'Y' then 'Yes'
when SoldAsVacant = 'N' then 'No'
else SoldAsVacant
end;We identified and remove duplicate rows while keeping only the first occurrence based on specific criteria.
CREATE TEMPORARY TABLE temp_table AS
SELECT
*
FROM (
SELECT
*,
ROW_NUMBER() OVER (
PARTITION BY ParcelID, SaleDate, SalePrice, LegalReference
ORDER BY UniqueID
) AS row_num
FROM nashville_housing
) AS subquery
WHERE row_num = 1;
TRUNCATE TABLE nashville_housing;
ALTER TABLE nashville_housing
ADD COLUMN row_num INT;
INSERT INTO nashville_housing
SELECT * FROM temp_table;
ALTER TABLE nashville_housing
DROP COLUMN row_num;Finally, removed columns that were no longer needed to simplify the dataset.
select *
from nashville_housing;
alter table nashville_housing
drop column OwnerAddress,
drop column TaxDistrict,
drop column PropertyAddress;






Comments