Creating a database to store case information of NYC S. typhi/paratyphi cases for monthly report to CDC. Fake data loaded in to database to query.

EER Diagram

SQL Code

Creating database, tables, and indexes

############################################################################
###                         FINAL PROJECT                                ###
###                    Allison Holmes - alh2233                          ###
############################################################################

# Create the schema
CREATE SCHEMA typhi_paratyphi;

USE typhi_paratyphi;


############################################################################
###                         CREATE TABLES                                ###
############################################################################
    # Note: "evt" is abbreviation used for case/patient IDs at NYC Department of Health and Mental Hygiene

##### CREATE PARENT TABLE ##################################################
    # Create Demographics table
CREATE TABLE demographics (
    PRIMARY KEY (evt),
    evt MEDIUMINT(7) UNSIGNED,
    dob DATE,
    state_of_residence VARCHAR(2),
    sex VARCHAR(1),
    food_handler TINYINT(1) UNSIGNED,
    date_sx_onset DATE,
    date_first_isolation DATE,
    serotype VARCHAR(9),
    international_travel TINYINT(1) UNSIGNED,
    hospitalized TINYINT(1) UNSIGNED,
    vaccinated TINYINT(1) UNSIGNED,
    part_of_outbreak TINYINT(1) UNSIGNED,
    linked_to_carrier TINYINT(1) UNSIGNED,
    known_carrier TINYINT(1) UNSIGNED
);

# Create INDEXES in Demographics table
CREATE INDEX sx_onset
    ON demographics(date_sx_onset);


##### CREATE LOOKUP TABLE ##################################################
    # Create Antibiotic table
CREATE TABLE antibiotics (
    PRIMARY KEY (med_id),
    med_id TINYINT(3) UNSIGNED,
    medication_name VARCHAR(100)
);


##### CREATE CHILD TABLE #1 ##################################################
    # Create Antibiotic Treatment table
CREATE TABLE antibiotic_treatment (
    PRIMARY KEY (evt, med_id, initiation_date),
    evt MEDIUMINT(7) UNSIGNED,
    med_id TINYINT(3) UNSIGNED,
    initiation_date DATE,
    admin_method VARCHAR(15),
    duration_days TINYINT(3) UNSIGNED,
    FOREIGN KEY (evt) REFERENCES demographics(evt),
    FOREIGN KEY (med_id) REFERENCES antibiotics(med_id)
);

# Create INDEXES in Antibiotic Treatment table
CREATE INDEX initiation_date
    ON antibiotic_treatment(initiation_date);

CREATE INDEX duration
    ON antibiotic_treatment(duration_days);
CREATE INDEX med_id
    ON antibiotic_treatment(med_id);


##### CREATE CHILD TABLE #2 ##################################################
    # Create Antibiotic Sensitivity table
CREATE TABLE antibiotic_sensitivity (
    PRIMARY KEY (lab_accession, med_id),
    evt MEDIUMINT(7) UNSIGNED,
    lab_accession MEDIUMINT(8) UNSIGNED,
    med_id TINYINT(3) UNSIGNED,
    specimen_date DATE,
    susceptibility VARCHAR(12),
    FOREIGN KEY (evt) REFERENCES demographics(evt),
    FOREIGN KEY (med_id) REFERENCES antibiotics(med_id)
);

# Create INDEXES in Antibiotic Sensitivity table
CREATE INDEX lab_accession
    ON antibiotic_sensitivity(lab_accession);
CREATE INDEX med_id
    ON antibiotic_sensitivity(med_id);

Creating triggers

############################################################################
###                         TRIGGERS                                     ###
############################################################################

# Create TRIGGERS in Demographics table
DELIMITER //

CREATE TRIGGER demo_triggers
    BEFORE INSERT ON demographics
    FOR EACH ROW
BEGIN
# Rule for male/female
    IF NEW.sex NOT IN("M","F") THEN
        SIGNAL SQLSTATE 'HY000'
        SET MESSAGE_TEXT = 'Error: Please enter sex assigned at birth as Male (M) or Female (F)';
    END IF;

# Rules that onset date, isolation date cannot be before DOB
    IF NEW.date_sx_onset < NEW.dob THEN
        SIGNAL SQLSTATE 'HY000'
        SET MESSAGE_TEXT = 'Error: Symptom onset date is before birthdate';
    END IF;
    
    IF NEW.date_first_isolation < NEW.dob THEN
        SIGNAL SQLSTATE 'HY000'
        SET MESSAGE_TEXT = 'Error: First isolation date is before birthdate';
    END IF;

# Rule for serotype entry
    IF NEW.serotype NOT IN("Typhi","Paratyphi") THEN
        SIGNAL SQLSTATE 'HY000'
        SET MESSAGE_TEXT = 'Error: Serotype must be typhoidal';
    END IF;

# Rules for all yes/no variables
    IF NEW.food_handler NOT IN(1,0) THEN
        SIGNAL SQLSTATE 'HY000'
        SET MESSAGE_TEXT = 'Error: Please enter 1 for yes, 0 for no';
    END IF; 
    
    IF NEW.international_travel NOT IN(1,0) THEN
        SIGNAL SQLSTATE 'HY000'
        SET MESSAGE_TEXT = 'Error: Please enter 1 for yes, 0 for no';
    END IF;  
    
    IF NEW.hospitalized NOT IN(1,0) THEN
        SIGNAL SQLSTATE 'HY000'
        SET MESSAGE_TEXT = 'Error: Please enter 1 for yes, 0 for no';
    END IF;      

    IF NEW.vaccinated NOT IN(1,0) THEN
        SIGNAL SQLSTATE 'HY000'
        SET MESSAGE_TEXT = 'Error: Please enter 1 for yes, 0 for no';
    END IF; 

    IF NEW.part_of_outbreak NOT IN(1,0) THEN
        SIGNAL SQLSTATE 'HY000'
        SET MESSAGE_TEXT = 'Error: Please enter 1 for yes, 0 for no';
    END IF; 

    IF NEW.linked_to_carrier NOT IN(1,0) THEN
        SIGNAL SQLSTATE 'HY000'
        SET MESSAGE_TEXT = 'Error: Please enter 1 for yes, 0 for no';
    END IF; 

    IF NEW.known_carrier NOT IN(1,0) THEN
        SIGNAL SQLSTATE 'HY000'
        SET MESSAGE_TEXT = 'Error: Please enter 1 for yes, 0 for no';
    END IF;
    
END //

# Create TRIGGERS in Antibiotic Treatment table
DELIMITER //

CREATE TRIGGER treatment_triggers
    BEFORE INSERT ON antibiotic_treatment
    FOR EACH ROW
BEGIN
    # Rule for medication administration labels
    IF NEW.admin_method NOT IN('IM','IV','Oral','Other') THEN
        SIGNAL SQLSTATE 'HY000'
        SET MESSAGE_TEXT = 'Error: Please enter as IM, IV, Oral, or Other';
    END IF;  

END //

# Create TRIGGERS in Antibiotic Sensitivity table
DELIMITER //

CREATE TRIGGER sensitivity_triggers
    BEFORE INSERT ON antibiotic_sensitivity
    FOR EACH ROW
BEGIN
    # Rule for susceptibility labels
    IF NEW.susceptibility NOT IN('Susceptible','Intermediate','Resistant') THEN
        SIGNAL SQLSTATE 'HY000'
        SET MESSAGE_TEXT = 'Error: Please enter as Susceptible, Intermediate, or Resistant';
    END IF;  

END //

Entering fake data into tables, importing CSV data

############################################################################
###                  ENTER DATA INTO TABLES                              ###
############################################################################

##### ANTIBIOTIC LOOKUP TABLE #############################################
    # My computer needs this setting in order to import CSVs
    SET GLOBAL local_infile = ON;

# Import antibiotic lookup table from CSV
LOAD DATA LOCAL INFILE 'Datasets/alh2233_lookup_data copy.csv'
INTO TABLE antibiotics
FIELDS TERMINATED BY ','
LINES TERMINATED BY '\n'
IGNORE 1 ROWS
(med_id, medication_name);

##### DEMOGRAPHICS TABLE ###################################################
INSERT INTO demographics (evt, dob, state_of_residence, sex, food_handler, date_sx_onset, date_first_isolation,
    serotype, international_travel, hospitalized, vaccinated, part_of_outbreak,
    linked_to_carrier, known_carrier)
    VALUES
    (1000000, '2000-01-01', 'NY', 'M', 0, '2022-07-01', '2022-07-20', 'Typhi', 1, 1, 0, 0, 0, 0),
    (1100000, '1996-05-01', 'NY', 'M', 1, '2022-08-10', '2022-08-15', 'Paratyphi', 1, 1, 0, 0, 0, 0),
    (1200000, '1998-10-30', 'NY', 'M', 0, '2022-08-21', '2022-08-30', 'Typhi', 1, 1, 0, 0, 0, 0),
    (6200000, '2010-12-01', 'NY', 'M', 0, '2022-09-01', '2022-09-07', 'Typhi', 0, 1, 0, 0, 1, 0),
    (1400000, '2017-01-21', 'NY', 'F', 0, '2022-09-06', '2022-09-10', 'Typhi', 1, 1, 0, 0, 0, 0),
    (1500000, '2009-10-01', 'NY', 'M', 0, '2022-09-10', '2022-09-15', 'Paratyphi', 1, 1, 0, 0, 0, 0),
    (6100000, '1980-06-12', 'NY', 'M', 0, '2022-09-15', '2022-09-27', 'Typhi', 0, 0, 0, 0, 0, 1),
    (2000000, '2015-05-11', 'NY', 'M', 0, '2022-09-23', '2022-09-30', 'Typhi', 1, 1, 0, 1, 0, 0),
    (2100000, '2018-02-25', 'NY', 'F', 0, '2022-10-01', '2022-10-31', 'Typhi', 1, 1, 0, 1, 0, 0),
    (2200000, '2006-03-07', 'NY', 'F', 0, '2022-11-01', '2022-11-10', 'Typhi', 1, 1, 0, 1, 0, 0);


##### ANTIBIOTIC TREATMENT TABLE ##########################################
INSERT INTO antibiotic_treatment (evt, med_id, initiation_date, admin_method, duration_days)
    VALUES
    (1000000, 70, '2022-07-20', 'IV', 12),
    (1100000, 70, '2022-08-15', 'IV', 4),    
    (1200000, 132, '2022-08-30', 'IV', 5),
    (6200000, 70, '2022-09-07', 'IV', 3),
    (6200000, 70, '2022-09-27', 'IV', 15),
    (1400000, 121, '2022-09-10', 'IV', 12),
    (1500000, 132, '2022-09-15', 'IV', 3),
    (1500000, 70, '2022-09-15', 'IV', 7),
    (2000000, 121,'2022-09-30', 'IV', 3),
    (2000000, 70,'2022-09-30', 'IV', 3),
    (2000000, 132,'2022-09-30', 'IV', 3),
    (2000000, 100,'2022-09-30', 'IV', 3),
    (2000000, 16,'2022-10-02', 'IV', 20),
    (2100000, 16,'2022-11-03', 'IV', 15),
    (2100000, 70, '2022-11-03', 'IV', 2),
    (2200000, 16,'2022-11-12', 'IV', 15);


##### ANTIBIOTIC SENSITIVITY TABLE ########################################
INSERT INTO antibiotic_sensitivity (evt, lab_accession, med_id, specimen_date, susceptibility)
    VALUES
    (1000000, 10101010, 70, '2022-07-20', 'Susceptible'),
    (1100000, 11111111, 70, '2022-08-15', 'Susceptible'),
    (1200000, 12111111, 132, '2022-08-30', 'Susceptible'),
    (1400000, 14141414, 121, '2022-09-10', 'Susceptible'),
    (1500000, 15151515, 121, '2022-09-15', 'Resistant'),
    (1500000, 15151515, 132, '2022-09-15', 'Intermediate'),
    (1500000, 15151515, 70, '2022-09-15', 'Susceptible'),
    (6100000, 16000000, 121, '2022-09-27', 'Resistant'),
    (6100000, 16000000, 70, '2022-09-27', 'Susceptible'),
    (2000000, 12121212, 121,'2022-09-30', 'Resistant'),
    (2000000, 12121212, 70,'2022-09-30', 'Resistant'),
    (2000000, 12121212, 132,'2022-09-30', 'Resistant'),
    (2000000, 12121212, 100,'2022-09-30', 'Resistant'),
    (2000000, 12232323, 16,'2022-10-02', 'Susceptible'),
    (2100000, 11212121, 121, '2022-10-31', 'Resistant'),
    (2100000, 11212121, 132, '2022-10-31', 'Resistant'),
    (2100000, 11212121, 100, '2022-10-31', 'Resistant'),
    (2100000, 11212121, 16, '2022-10-31', 'Susceptible'),
    (2200000, 12222222, 121, '2022-11-10', 'Resistant'),
    (2200000, 12222222, 132, '2022-11-10', 'Resistant'),
    (2200000, 12222222, 100, '2022-11-10', 'Resistant'),
    (2200000, 12222222, 16, '2022-11-10', 'Susceptible');

Writing queries

############################################################################
###                         QUERIES                                      ###
############################################################################

### Query #1 ###############################################################
CREATE VIEW sensitivity_names AS
SELECT se.evt, se.lab_accession, a.medication_name, se.susceptibility
FROM antibiotic_sensitivity AS se
    INNER JOIN antibiotics AS a
    USING (med_id);
    
/*  This view is compiling information for each patient's antibiotic susceptibility testing in 
    a readable format (joining medication names rather than numeric codes) that can be quickly
    queried to see which antibiotics a patient had sensitivity/susceptibility testing for.
*/


### Query #2 ###############################################################
CREATE TEMPORARY TABLE antibiotics_used AS
SELECT tx.evt, tx.med_id, a.medication_name, tx.initiation_date, se.specimen_date, se.susceptibility
FROM antibiotic_treatment AS tx
    LEFT JOIN antibiotic_sensitivity AS se
    USING (evt, med_id)
    
    LEFT JOIN antibiotics AS a
    USING (med_id);

SELECT evt, medication_name
FROM antibiotics_used
WHERE susceptibility IS NULL;

/* Where any antibiotics used that were not tested for susceptibility?
    Yes, Ampicillin was used to treat 2 patients without testing for susceptibility.

The temporary table displays all antibiotics used to treat cases and left joins to the antibiotics that
underwent susceptibility testing and the antibiotic names.
The query then filters this temporary table to results where susceptibility was not performed to answer the question.
*/


### Query #3 ###############################################################
WITH all_antibiotics AS
(
    SELECT d.evt, COUNT(DISTINCT(tx.med_id)) AS treatments, COUNT(DISTINCT(se.med_id)) AS susc_tested
    FROM demographics AS d
        INNER JOIN antibiotic_treatment AS tx
        USING (evt)
        
        INNER JOIN antibiotic_sensitivity AS se
        USING (evt)
    GROUP BY evt
)

SELECT ROUND(AVG(treatments), 2) AS avg_tx, ROUND(AVG(susc_tested), 2) AS avg_susc_tested
FROM all_antibiotics;

/*
What is the average number of antibiotic treatments all patients receive and the average number
of antibiotic susceptibilities tested on all patients?
    Average treatments: 1.75
    Average susceptibility tests: 2.50
CTE displays the count of the total number of medications and tests each patients received.
The query then takes the average of these numbers.
    Note: I used demographics table for the EVT and a left join in case a patient has not received any treatment/sensitivity testing.
*/


### Query #4 ###############################################################
CREATE TABLE case_data_by_month AS
SELECT YEAR(date_first_isolation) AS year, MONTH(date_first_isolation) AS month, COUNT(evt) AS total_cases, 
        COUNT(IF(serotype='Typhi', evt, NULL)) AS typ_cases, COUNT(IF(serotype='Paratyphi', evt, NULL)) AS pty_cases, 
        COUNT(IF(sex='M', evt, NULL)) AS num_male, COUNT(IF(sex='F', evt, NULL)) AS num_female,
        SUM(food_handler) AS num_foodhandlers, SUM(international_travel) AS num_traveled, SUM(hospitalized) AS num_hospitalized,
        SUM(vaccinated) AS num_vaccinated, SUM(part_of_outbreak) AS outbreak_cases, SUM(linked_to_carrier) AS secondary_cases, 
        SUM(known_carrier) AS carriers
FROM demographics
GROUP BY year, month
ORDER BY year, month;

/*
This table represents data pivoted from individual-level in the demographics table (long) to 
population-level data month-over-month with monthly totals for each category of data collected (wide).
    Although this is not the exact syntax we used to pivot tables in class, I could not think of another useful
    way to use this data in a "wide" format.
This table could be used to analyze trends across all patients in NY diagnosed with typhoid/paratyphoid fever.
*/


### Query #5 ###############################################################
SELECT a.evt, a.med_id, a.initiation_date AS initiation_1, b.initiation_date AS initiation_2
FROM antibiotic_treatment AS a
    INNER JOIN antibiotic_treatment AS b
    USING (evt, med_id)
WHERE a.med_id = b.med_id AND a.initiation_date < b.initiation_date;

/*
Are there any patients who initiated the same antibiotic treatment on two dates?
    Yes, there is one patient who initiatied med_id 70 on 9/7/2022 and then again on 9/27/2022
    
This query shows the multiple initiation dates of a patient's treatment drug in a single row.  
*/


### Query #6 ###############################################################
SELECT evt, COUNT(DISTINCT(med_id)) AS num_tx
FROM antibiotic_treatment
GROUP BY evt
HAVING num_tx = (
    SELECT COUNT(DISTINCT(med_id)) AS num_tx
    FROM antibiotic_treatment
    GROUP BY evt
    ORDER BY num_tx DESC
    LIMIT 1);
    
/*
Which patient(s) received the most different antibiotics as part of their treatment
and how many antibiotics?
    EVT 2000000 received the most number of antibiotics, having received 5 over the course of treatment.
This query will result with a list of EVTs and the number of treatments for any patient
with the to the maximum number of treatments (the maximum is the result of the subquery).
*/


### Query #7 ###############################################################
SELECT evt, tx.med_id AS used_treatment, se.med_id AS sensitivity_tested
FROM antibiotic_treatment AS tx
    LEFT JOIN antibiotic_sensitivity AS se
    USING (evt, med_id)
    
UNION

SELECT evt, tx.med_id AS used_treatment, se.med_id AS sensitivity_tested
FROM antibiotic_treatment AS tx
    RIGHT JOIN antibiotic_sensitivity AS se
    USING (evt, med_id);

/* Results display a full join of all patients who have received antibiotics and all
patients who have received antibiotic susceptibility testing. In theory, this database
could be used to store data beyond just typhoid/paratyphoid cases. Susceptibility testing
may be conducted any time a bacteria is cultured. Conversely, antibiotics may be given as
treatment without susceptibility testing. This table captures all these cases and those cases
who have only completed susceptibility testing OR received antibiotics, but not both.

UNION is used instead of UNION ALL since we do not want duplicate medications given to one case to be displayed.
*/


### Query #8 ###############################################################
SELECT *, COUNT(IF(susceptibility = 'Resistant', med_id, NULL)) OVER(PARTITION BY evt) AS num_resistant,
    CASE
        WHEN COUNT(IF(susceptibility = 'Resistant', med_id, NULL)) OVER(PARTITION BY evt) > 10 THEN 'XDR'
        WHEN COUNT(IF(susceptibility = 'Resistant', med_id, NULL)) OVER(PARTITION BY evt) > 2 THEN 'MDR'
        WHEN COUNT(IF(susceptibility = 'Resistant', med_id, NULL)) OVER(PARTITION BY evt) > 0 THEN 'Some Resistance'
        WHEN COUNT(IF(susceptibility = 'Resistant', med_id, NULL)) OVER(PARTITION BY evt) = 0 THEN 'Not Resistant'
    END AS resistance_category
FROM antibiotic_sensitivity;

/* Add an aggregated column to the antibiotic_sensitivity table displaying the total number of antibiotics each case shows resistance
to and categorize the resistance into Not Resistant, Some Resistance, MDR, and XDR using numerical cutoffs.
I used OVER(PARTITION BY) since I want the resistance category to be displayed by case EVT.
*/


### Query #9 ###############################################################
SELECT evt, num_resistant, DENSE_RANK() OVER(ORDER BY num_resistant DESC) AS resistance_rank
FROM 
    (SELECT *, COUNT(IF(susceptibility = 'Resistant', med_id, NULL)) OVER(PARTITION BY evt) AS num_resistant
    FROM antibiotic_sensitivity) AS res_totals
GROUP BY evt, num_resistant;

/* Ranks patients from most to least number of antibiotics they are resistant to.
I used as DENSE_RANK() in place of a RANK() since I do not want to skip rank values.
*/


### Query #10 ###############################################################
SELECT a.medication_name, COUNT(evt) AS num_administered
FROM demographics as d
    LEFT JOIN antibiotic_treatment AS tx
    USING (evt)
    
    LEFT JOIN antibiotics AS a
    USING (med_id)
GROUP BY medication_name
ORDER BY num_administered DESC
LIMIT 1;

/* What is the most commonly used medication to treat typhoid/paratyphoid fever in this dataset
    and how many times was it used?
    
    Ampicillin/sulbactam, 7 times

*/