WB Project PDOs Data Preprocessing

Author

Luisa M. Mimmi

Warning

WORK IN PROGRESS! (Please expect unfinished sections, and unpolished code. Feedback is welcome!)

Set up

# Pckgs -------------------------------------
library(fs) # Cross-Platform File System Operations Based on 'libuv'
library(tidyverse) # Easily Install and Load the 'Tidyverse'
library(janitor) # Simple Tools for Examining and Cleaning Dirty Data
library(skimr) # Compact and Flexible Summaries of Data
library(here) # A Simpler Way to Find Your Files
library(paint) # paint data.frames summaries in colour
library(readxl) # Read Excel Files
library(tidytext) # Text Mining using 'dplyr', 'ggplot2', and Other Tidy Tools
library(SnowballC) # Snowball Stemmers Based on the C 'libstemmer' UTF-8 Library
library(rsample) # General Resampling Infrastructure
library(rvest) # Easily Harvest (Scrape) Web Pages
library(cleanNLP) # A Tidy Data Model for Natural Language Processing #
library(rlang) # Functions for Base Types and Core R and 'Tidyverse' Features
library(kableExtra) # Construct Complex Table with 'kable' and Pipe Syntax
# non scientific notation 
options(scipen=999)
# load functions
source(here::here("R", "f_recap_values.R")) # load functions

Data sources

_______

LOADING FILES

WB Projects & Operations

World Bank Projects & Operations can be explored at:

The Accessibility Classification is public under Creative Commons Attribution 4.0

Process to ingest & preprocess raw PDO text data

  1. Retrieve manually ALL WB projects (22,569) listed (approval obtained or requested between FY 1947 and 2026 as of 31/08/2024) using the Excel button on this page: WBG Projects
  2. Split the dataset and keep only projs_train (50% of projects with PDO text, i.e. 4413 PDOs)
  3. Clean the projs_train dataset
  4. Obtain PoS tagging + tokenization with cleanNLP package (functions cnlp_init_udpipe() + cnlp_annotate()) and saved projs_train_t (cleaned version of projs_train dataset).

I had made a prior attempt to ingest the WB document data using the rvest package, following the work of Stephanie Tran project who created the function R/f_scrape_WB-OKR.R. However the WB website has changed and the function is not working anymore.

✅ Ingest Projects data (manually from *.xlsx file)

# Load the data
all_projects_as_of29ago2024 <- readxl::read_excel(here::here (
   "data", "raw_data", "project2","all_projects_as_of29ago2024.xls"), 
   col_names = FALSE,
   skip = 1) 

# Column names only
cnames <- read_excel(here::here("data", "raw_data", "project2", 
                                "all_projects_as_of29ago2024.xls"), 
                     col_names = FALSE,
                     skip = 1,
                     n_max = 2) 
# Complete file
all_proj <- read_excel(here::here("data", "raw_data", "project2", 
                                  "all_projects_as_of29ago2024.xls"), 
                         col_names = TRUE,
                         skip = 2) 
# Save as .RDS 
save(all_proj, file = here::here("data", "raw_data", "project2", 
                                 "all_projects_as_of29ago2024.Rdata") ) 
rm(all_projects_as_of29ago2024)

Explore Project mega file

Checking the completeness of the columns for key information.

paint::paint(cnames)
paint::paint(all_proj)

skimr::skim(all_proj$id) # complete_rate 1 = 100%
skimr::skim(all_proj$pdo)  # complete_rate 0.503
skimr::skim(all_proj$boardapprovaldate) # complete_rate 0.779
skimr::skim(all_proj$closingdate) # complete_rate 0.707 

Recode dates all_proj

This data set has a lot of blank values, probably also bc some information was not collected way back in 1947… (e.g. PDO)

# Mess of data format weird in different ways in 2 cols: 
    # 1947-12-31 12:00:00 # closingdate
    # 8/3/1948 12:00:00 AM # closingdate
    # 1955-03-15T00:00:00Z # boardapprovaldate
 
# List of possible inconsistent date formats
common_date_formats <- c(
  "ymd HMS", "ymd HM", "ymd",           # ISO: 2023-03-15 12:30:00
  "ymd_HMS", "ymd_HM", "ymd T",         # ISO with underscore/T separator
  "mdy HMS", "mdy HM",                  # U.S. style with AM/PM
  "mdy HMSp",                           # "MM/DD/YYYY HH:MM AM/PM"
  "dmy HMS", "dmy HM",                  # European style
  "Ymd HMS", "BdY IMS p",               # Weird but possible
  "Y-m-d", "m/d/Y", "d/m/Y"             # Fallback basics
)

# set date columns
date_columns_exe <- c("closingdate", "boardapprovaldate")

# function to parse messy, inconsistent date formats for list of date col 
f_parse_date <- function(data, date_columns) {
  data %>%
    mutate(across(
      .cols = all_of(date_columns),
      .fns = ~ if_else(
        .x == "" | is.na(.x),
        as.POSIXct(NA),
        parse_date_time(.x, orders = common_date_formats)
      )
    ))
}


# Mutate the date columns to parse the dates, handling different formats and blanks
all_proj_temp <- all_proj %>%
   # CALL my function !
   f_parse_date(date_columns = date_columns_exe) %>%
   # SPLIT date into year and month
   mutate(boardapproval_year = year(boardapprovaldate),
          boardapproval_month = month(boardapprovaldate)) %>% 
   mutate(boardapprovalFY = case_when( 
      boardapproval_month >= 1 & boardapproval_month < 7 ~ boardapproval_year,
      boardapproval_month >= 7 & boardapproval_month <= 12 ~ boardapproval_year +1)) %>% 
   relocate(boardapprovalFY, .after = boardapprovaldate ) %>% 
   mutate(closingdate_year = year(closingdate),
          closingdate_month = month(closingdate)) %>% 
   mutate(closingdateFY = case_when( 
      closingdate_month >= 1 & closingdate_month < 7 ~ closingdate_year,
      closingdate_month >= 7 & closingdate_month <= 12 ~ closingdate_year +1)) %>% 
   relocate(closingdateFY, .after = closingdate ) 

# Check the results
tabyl(all_proj$closingdate)   
tabyl(all_proj_temp$closingdateFY)

tabyl(all_proj$boardapprovaldate)   
tabyl(all_proj_temp$boardapprovalFY)

skimr::skim(all_proj_temp$boardapprovalFY) # complete_rate 0.779
skimr::skim(all_proj_temp$closingdateFY) # complete_rate 0.707
skimr::skim(all_proj_temp$pdo) # complete_rate 0.707

[Later (2025) data dump ]

On March 31, 2025, I retrieved the same dataset from the WB website. The dataset now carries additional sheets (“Themes”, “Sectors”, “GEO locations” and “Financiers”) which I retrieve and save as all_proj_themes, all_proj_sectors, all_proj_geo and all_proj_financiers respectively.

🟠 projects as of 31/03/2025

all_proj_25_temp <- readxl::read_excel(here::here (
   "data", "raw_data", "project3","all_projects_as_of31mar2025.xlsx"), 
      sheet = "World Bank Projects", skip = 1) %>% 
   janitor::clean_names() %>% 
   # keep only project_id with match in all_proj
   filter(project_id %in% all_proj$id)
   
names(all_proj_temp)
names(all_proj_25_temp)


all_proj_25 <- all_proj_25_temp %>% 
   # new = old
   dplyr::rename("id"  = "project_id" ,
          "project_name"  =     "project_name"                      ,
          "pdo_2025"    =     "project_development_objective"     ,
          "impagency"  =     "implementing_agency"               ,
          "cons_serv_reqd_ind"  =     "consultant_services_required"      ,
         # NO url
          "regionname"  =     "region"                            ,
          "countryname" =     "country"                           ,
          
         "projectstatusdisplay"  =     "project_status"                    ,
         "last_stage_reached_name" =     "last_stage_reached_name"           ,
         # "public_disclosure_date"           ,
         "boardapprovaldate"  =     "board_approval_date"               ,
         #"loan_effective_date"              ,
         "closingdate"  =     "project_closing_date"              ,
           
         "projectfinancialtype" = "financing_type" ,
         
         "curr_project_cost" =     "current_project_cost"              ,
         "curr_ibrd_commitment" =     "ibrd_commitment"                   ,
         "curr_ida_commitment"  =     "ida_commitment"                    ,
          "curr_total_commitment"    = "total_ibrd_ida_and_grant_commitment",
         "grantamt"  =     "grant_amount"                      ,

          "borrower"  =     "borrower"                          ,
          "lendinginstr" =     "lending_instrument"                ,
          "envassesmentcategorycode" =     "environmental_assessment_category" ,
          "esrc_ovrl_risk_rate" =     "environmental_and_social_risk"     
          # "associated_project"               ,
          # NO "sector1",
          # NO "sector2",
          # NO "sector3",
          # NO "theme1",
          # NO "theme2"
) |> 
   dplyr::select(
      id, project_name, pdo_2025, impagency, cons_serv_reqd_ind,
      regionname, countryname, projectstatusdisplay, last_stage_reached_name,
      boardapprovaldate, 
      closingdate,
      projectfinancialtype, curr_project_cost, curr_ibrd_commitment,
      curr_ida_commitment, curr_total_commitment, grantamt,
      borrower, lendinginstr, 
      envassesmentcategorycode, esrc_ovrl_risk_rate,
   )
# keep only row with level_1 !=na
all_proj_25_t <- all_proj_25 %>% 
   # keep only project_id with match in all_proj
   filter(id %in% all_proj$id)

Recode dates all_proj_25_t

# Mutate the date columns to parse the dates, handling different formats and blanks
all_proj_25_t <- all_proj_25_t %>%
   # CALL my function !
   f_parse_date(date_columns = date_columns_exe) %>%
   # SPLIT date into year and month
   mutate(boardapproval_year = year(boardapprovaldate),
          boardapproval_month = month(boardapprovaldate)) %>% 
   mutate(boardapprovalFY = case_when( 
      boardapproval_month >= 1 & boardapproval_month < 7 ~ boardapproval_year,
      boardapproval_month >= 7 & boardapproval_month <= 12 ~ boardapproval_year +1)) %>% 
   relocate(boardapprovalFY, .after = boardapprovaldate ) %>% 
   mutate(closingdate_year = year(closingdate),
          closingdate_month = month(closingdate)) %>% 
   mutate(closingdateFY = case_when( 
      closingdate_month >= 1 & closingdate_month < 7 ~ closingdate_year,
      closingdate_month >= 7 & closingdate_month <= 12 ~ closingdate_year +1)) %>% 
   relocate(closingdateFY, .after = closingdate ) 

# Check the results
skimr::skim(all_proj_25_t$boardapprovalFY) # complete_rate 0.779
skimr::skim(all_proj_25_t$closingdateFY) # complete_rate 0.707
skimr::skim(all_proj_25_t$pdo_2025) # complete_rate 0.707

interesting there are 22,210 (i.e. less than 22,569) projects in the 2025 dataset.

🟠 themes

[No clear why there are so many rows per Project ID in the Themes sheet –> not mooving until understood]

all_proj_themes_25_l <- readxl::read_excel(here::here (
   "data", "raw_data", "project3","all_projects_as_of31mar2025.xlsx"), 
      sheet = "Themes", skip = 1) %>% 
   janitor::clean_names() %>% 
   # keep only project_id with match in all_proj
   filter(project_id %in% all_proj$id)
   
# # save next to the original file 
# save(all_proj_themes_25, file = here::here("data", "raw_data", "project2", 
#                                            "all_proj_themes_2025.Rdata") )

# convert to wide format with one row per project_id
# all_proj_themes_25 <- all_proj_themes_25_l %>%
#   tidyr::pivot_wider(
#     id_cols = project_id,
#     names_from = theme_no,
#     values_from = c(level, percentage)
#   )               

✅ Theme 1 from 2025 data

Extract matching project IDs with the largest percentage of level_1 (theme) for each project.

# keep only row with level_1 !=na

all_proj_theme_1_25 <- all_proj_themes_25_l %>% 
   filter(is.na(level_2) & is.na(level_3)) %>% 
   # keep the largest value of percentage_1 among the rows with the same project_id
   group_by(project_id) %>%
   filter(percentage_1 == max(percentage_1)) %>% 
   # keep just one row for each project_id
   slice(1) %>% 
   # drop empty cols
   select(project_id, level_1, percentage_1) %>% 
   rename(
      theme_1_25 =  level_1,
      theme1_perc_25 = percentage_1
   )

🟠 sectors

[Here too, it seems in long fomr but it is not very clear why there are so many rows per Project ID in the Sectors sheet –> not mooving until understandood]

all_proj_sectors_25_l <- readxl::read_excel(here::here (
   "data", "raw_data", "project3","all_projects_as_of31mar2025.xlsx"), 
      sheet = "Sectors", skip = 1) %>% 
   janitor::clean_names() %>% 
   # keep only project_id with match in all_proj
   filter(project_id %in% all_proj$id)

# convert to wide format with one row per project_id
all_proj_sectors_25 <- all_proj_sectors_25_l %>%
  tidyr::pivot_wider(
    id_cols = project_id,
    names_from = major_sector,
    values_from = c(sector, sector_percent)
  )

# # save next to the original file
# save(all_proj_sectors_25, file = here::here("data", "raw_data", "project2", 
#                                            "all_proj_sectors_2025.Rdata") )

✅ Sector 1 from 2025 data

Extract matching project IDs with the largest percentage of sector_percent (sector) for each project.

# keep only row with sector_percent !=na
all_proj_sector_1_25 <- all_proj_sectors_25_l %>% 
   filter(!is.na(sector_percent)) %>% 
   # keep the largest value of sector_percent among the rows with the same project_id
   group_by(project_id) %>%
   filter(sector_percent == max(sector_percent)) %>% 
   # keep just one row for each project_id
   slice(1) %>% 
   rename(
      major_sector1_25 = major_sector  ,
      sector1_25 = sector ,
      sector1_perc_25 = sector_percent
   )

🟠 GEO locations

In long form?

# multiple loc per ID 
all_proj_geo_25 <- readxl::read_excel(here::here (
   "data", "raw_data", "project3","all_projects_as_of31mar2025.xlsx"), 
      sheet = "GEO Locations", skip = 1) %>% 
   janitor::clean_names() %>% 
   # keep only project_id with match in all_proj
   filter(project_id %in% all_proj$id)

# # save next to the original file
# save(all_proj_geo_25, file = here::here("data", "raw_data", "project3", 
#                                            "all_proj_geo_2025.Rdata") )

🟠 Financiers

In long form?

all_proj_financiers_25_l <- readxl::read_excel(here::here (
   "data", "raw_data", "project3","all_projects_as_of31mar2025.xlsx"), 
      sheet = "Financers", skip = 1) %>% 
   janitor::clean_names() %>% 
   # keep only project_id with match in all_proj
   filter(project %in% all_proj$id) %>% 
   rename (project_id = project)

# # save next to the original file
# save(all_proj_financiers_25, file = here::here("data", "raw_data", "project2", 
#                                            "all_proj_financiers_2025.Rdata") )

✅ Join pdo, sector1 and theme1 to all_proj_t

This info may help so I will attach to the existin gfile (but not change ooriginal project list)

# select only the columns of interest
all_proj_theme_1_25 <- all_proj_theme_1_25 %>% 
   select(project_id, theme_1_25, theme1_perc_25)

all_proj_sector_1_25 <- all_proj_sector_1_25 %>%
   select(project_id, major_sector1_25, sector1_25, sector1_perc_25)

all_proj_25_t <- all_proj_25_t %>% 
   select(id, pdo_2025)

# join the data to the original file
all_proj_t <- left_join(
   # add theme1 rom 2025
   all_proj_temp, all_proj_theme_1_25, by = c("id" = "project_id")) %>% 
   relocate(theme_1_25, theme1_perc_25,  .after = theme1) %>% 
   
   # add sector1 from 2025
   left_join(all_proj_sector_1_25, by = c("id" = "project_id")) %>% 
   relocate(major_sector1_25, sector1_25, sector1_perc_25, .after = sector1) %>% 
   
   # add pdo from 2025
   left_join(all_proj_25_t, by = c("id" = "id"))  %>% 
   relocate(pdo_2025, .after = pdo)

With the updated data from 2025, the tagging of sector1_25 (4.4% missing instead of 16.6%) and theme1_25 (30.9% missing instead of 51.9%) is more complete!

_______

LOOKING @ FILE PDOs

Recap content

A lot of PDOs are missing, especially from earlier FYs

  • 22,569 Projects’ IDs, of which…
  • 9,774 (non missing) PDOs … 49,7% are missing
  • 22,569-4981 = 17,588 (non missing) board approval FY … 22.1% are missing
    • ranging from 1947 - 2026

[FUN] Count missing/distinct values all_proj_t

# # Function to count missing values and distinct values in a subset of columns
# f_recap_values <- function(data, columns) {
#    # Select the subset of columns
#    df_subset <- data %>% select(all_of(columns))
# 
#    # 1) Use skimr to skim the data
#    skimmed <- skim(df_subset)
#    
#    # 2) Get the number of rows in the dataset
#    total_rows <- nrow(df_subset)
# 
#    
#    # 3) Calculate the number of distinct values for each column
#    distinct_counts <- df_subset %>%
#       summarise(across(everything(), n_distinct)) %>%
#       pivot_longer(everything(), names_to = "skim_variable", values_to = "n_distinct")
#    
#    # Extract the relevant columns for column names, missing values, and distinct counts
#    missing_table <- skimmed %>%  # 1) 
#       select(skim_variable, n_missing) %>%
#       # Add the total number of rows
#       mutate(total_rows = total_rows) %>%  # 2) 
#       # Join with distinct counts
#       left_join(distinct_counts, by = "skim_variable") %>%  # 3) 
#       relocate(n_distinct, n_missing, .after = total_rows) %>%
#       mutate(missing_perc = round((n_missing/total_rows)*100, 1), 
#              missing_perc = glue::glue("{missing_perc}%")) %>%
#       arrange(desc(n_distinct))
#    
#    # Return the table
#    return(missing_table) 
# }
# 
# # exe use 
# #f_recap_values(df, c("col1","col2"))
# CALL  the function on a subset of columns
f_recap_values(all_proj_t, c("id","pdo","pdo_2025",
                             "projectstatusdisplay", 
                             "boardapprovalFY", "regionname",
                             "sector1","sector1_25",
                             "theme1", "theme_1_25",   
                             "theme1", "projectfinancialtype"))

Combine PDO from 2024 and 2025

# Combine the PDOs from 2024 and 2025
all_proj_t <- all_proj_t %>%
   dplyr::mutate(pdo = coalesce(pdo, pdo_2025)) %>%
   # sector 
   mutate(sector1 = coalesce(sector1, major_sector1_25, sector1_25))  %>% 
   # theme
   mutate(theme1 = coalesce(theme1, theme_1_25)) %>% 
   # drop the 2025 columns
   select(-c(pdo_2025, major_sector1_25, sector1_25, theme_1_25))
# CALL  the function on a subset of columns
f_recap_values(all_proj_t, c("id",
                             "project_name",
                             "pdo", 
                             "projectstatusdisplay", 
                             "boardapprovalFY", 
                             "regionname",
                             "sector1", 
                             "theme1",   
                             "envassesmentcategorycode",
                             "esrc_ovrl_risk_rate",                             "projectfinancialtype"))

# A tibble: 9 × 5   
#   skim_variable             missing_perc
# id                            0%          
# project_name                  7.9%        
# pdo                          31.1%       
# projectstatusdisplay          7.9%        
# regionname                    7.9%        
# sector1                       4.2%        
# theme1                       28.5%       
# envassesmentcategorycode     45.9%       
# esrc_ovrl_risk_rate          92.1%       
# projectfinancialtype         45.2%       
# boardapprovalFY              22.1%  

Which PDOs are missing?

Ideally, the PDOs are missing in a non systematic way. To check it, I compare the distributions across some key features of ALL PROJs v. NON-missing-PDO-PROJs

missing_pdo <- all_proj_t %>% 
   #select(id, pdo, countryname, projectstatusdisplay, lendinginstr, boardapprovalFY, projectfinancialtype) %>% 
   filter(is.na(pdo))

# Now I compare to get a sense of distribution in all_proj_t v. missing_pdo... 
tabyl(all_proj_t$projectstatusdisplay) %>%  adorn_pct_formatting()
tabyl(missing_pdo$projectstatusdisplay) %>%  adorn_pct_formatting()
# Region  (seem same)
tabyl(all_proj_t$regionname)  %>% adorn_pct_formatting() 
tabyl(missing_pdo$regionname)  %>% adorn_pct_formatting() 
# FY (seem same)
tabyl(all_proj_t$boardapprovalFY) %>%  adorn_pct_formatting()
tabyl(missing_pdo$boardapprovalFY) %>%  adorn_pct_formatting()
# too many sectors... (seem same)
tabyl(all_proj_t$sector1)  %>% adorn_pct_formatting() 
tabyl(missing_pdo$sector1)  %>% adorn_pct_formatting() 

# too little filled themes
tabyl(all_proj_t$theme1)  %>% adorn_pct_formatting() # 51.9% NA
tabyl(missing_pdo$theme1)  %>% adorn_pct_formatting() # 67.4%  NA
# Environmental Assessment Category
tabyl(all_proj_t$envassesmentcategorycode)  %>% adorn_pct_formatting() # 92.1%   NA
tabyl(missing_pdo$envassesmentcategorycode)  %>% adorn_pct_formatting() 
# Environmental and Social Risk
tabyl(all_proj_t$esrc_ovrl_risk_rate)  %>% adorn_pct_formatting() # 98.4%  NA
tabyl(missing_pdo$esrc_ovrl_risk_rate)  %>% adorn_pct_formatting() 
# by fin instrum 
tabyl(all_proj_t$projectfinancialtype)  %>% adorn_pct_formatting() 
tabyl(missing_pdo$projectfinancialtype)  %>% adorn_pct_formatting() 
# by lending instrum
tabyl(all_proj_t$lendinginstr)  %>% adorn_pct_formatting()  # Spec Inv Loan 6567 |29.1%
tabyl(missing_pdo$lendinginstr)  %>% adorn_pct_formatting() # Spec Inv Loan 4928 |43.9%

[Chi-Squared Test in the case of projectfinancialtype]

Goodness of fit test to see if the distribution of projectfinancialtype in the sample is significantly different from the population.

# Chi-Square Test: if a discrete distribution of a sample reflects the population, you can use a chi-square goodness-of-fit test

# by financialtype 
# Population distribution (excluding NAs)
pop <- all_proj_t %>%
  filter(!is.na(projectfinancialtype)) %>%
  count(projectfinancialtype, name = "n") %>%
  mutate(percent = n / sum(n))

# Observed distribution (excluding NAs)
obs <- missing_pdo %>%
  filter(!is.na(projectfinancialtype)) %>%
  count(projectfinancialtype, name = "n") %>% 
  mutate(percent = n / sum(n))

# Ensure both pop and obs are aligned by category
combined <- obs %>%
  inner_join(pop, by = "projectfinancialtype", suffix = c("_obs", "_pop"))


# Extract observed counts and expected proportions (from the population)
observed <- combined$n_obs
expected_prop <- combined$percent_pop
observed
expected_prop

# Perform the chi-square test
chisq_test <- chisq.test(observed, p = expected_prop)

# Display the results
chisq_test
# data:  observed
# X-squared = 3421.4, df = 4, p-value < 0.00000000000000022

# low p-value (typically < 0.05) suggests that the sample distribution is significantly different from the population!!!!

[Chi-Squared Test in the case of lendinginstr]

Goodness of fit test to see if the distribution of lendinginstr in the sample is significantly different from the population.

# Chi-Square Test: if a discrete distribution of a sample reflects the population, you can use a chi-square goodness-of-fit test

# by fin lendinginstr 
# Population distribution (excluding NAs)
pop2 <- all_proj_t %>%
  filter(!is.na(lendinginstr)) %>%
  count(lendinginstr, name = "n") %>%
  mutate(percent = n / sum(n))

# Observed distribution (excluding NAs)
obs2 <- missing_pdo %>%
  filter(!is.na(lendinginstr)) %>%
  count(lendinginstr, name = "n") %>% 
  mutate(percent = n / sum(n))

# Ensure both pop and obs are aligned by category
combined2 <- obs2 %>%
  inner_join(pop2, by = "lendinginstr", suffix = c("_obs", "_pop"))

combined2

# Extract observed counts and expected proportions (from the population)
observed2 <- combined2$n_obs
# Normalize expected_prop2 so they sum to 1
expected_prop2 <- combined2$percent_pop / sum(combined2$percent_pop)

# Perform the chi-square test
chisq_test2 <- chisq.test(observed2, p = expected_prop2)

# Display the results
chisq_test2
# data:  observed
#X-squared = 5659.6, df = 15, p-value < 0.00000000000000022

# low p-value (typically < 0.05) suggests that the sample distribution is significantly different from the population!!!!

Based on some available project features, I would say that even if many projects have missing feature value, PDO texts’ missingness seems to happen at random, except maybe for:

  • approval FY
  • projectfinancialtype (Grants, IBRD, etc) but there are many missing type (many more in with-PDO sample)…
  • lendinginstr specific Investment Loan are missing PDO in 4928 pr (43.9%).

Why? + Chi-square tests are sensitive to sample size. Even small percentage differences can become statistically significant when the sample size is large (which is the case here!)

# Prep data for plotting
combined_long <- combined %>%
  select(projectfinancialtype, percent_obs, percent_pop) %>%
  pivot_longer(
    cols = starts_with("percent"),
    names_to = "source",
    values_to = "percent"
  ) %>%
  mutate(
    source = recode(source,
      "percent_obs" = "Sample (missing PDO)",
      "percent_pop" = "Population"
    )
  )

# Plot
# Plot
ggplot(combined_long, aes(x = projectfinancialtype, y = percent, fill = source)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(
    title = "Distribution of Project Financial Type",
    x = "Financial Type",
    y = "Percentage",
    fill = "Source"
  ) +
  theme_minimal(base_size = 13)

# Prep data for plotting
combined_long2 <- combined2 %>%
  select(lendinginstr, percent_obs, percent_pop) %>%
  pivot_longer(
    cols = starts_with("percent"),
    names_to = "source",
    values_to = "percent"
  ) %>%
  mutate(
    source = recode(source,
      "percent_obs" = "Sample (missing PDO)",
      "percent_pop" = "Population"
    )
  )

# plot
ggplot(combined_long2, aes(x = lendinginstr, y = percent, fill = source)) +
  geom_col(position = "dodge") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
  labs(
    title = "Distribution of Lending Instrument",
    x = "Lending Instrument",
    y = "Percentage",
    fill = "Source"
  ) +
  theme_minimal(base_size = 13) +
   # Rotate x-axis labels
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

_______

TEXT PREPROCESSING

Save intermediate data

# origin
saveRDS(all_proj_t, here("data" , "derived_data", "all_proj_t.rds"))#  11,279 obs

Obtain Reduced df projs

# check which years are missing pdos 
pdo_by_year <- all_proj_t %>%
  group_by(boardapprovalFY) %>%
  summarise(
    pdo_missing = sum(is.na(pdo)),
    pdo_not_missing = sum(!is.na(pdo)),
    .groups = "drop"
  )

# Step 1: Find the first year with pdo_not_missing == 0
first_year_pdo <- pdo_by_year %>%
  filter(pdo_not_missing != 0) %>%
  summarise(first_year = min(boardapprovalFY, na.rm = TRUE)) %>%
  pull(first_year)



# Step 2: Sum pdo_missing for years before that
pjs_before_pdo <- pdo_by_year %>%
  filter(boardapprovalFY < first_year_pdo) %>%
  summarise(total_missing = sum(pdo_missing))

first_year_pdo
pjs_before_pdo

#save them  
saveRDS(first_year_pdo, here("data" , "derived_data", "first_year_pdo.rds"))
saveRDS(pjs_before_pdo, here("data" , "derived_data", "pjs_before_pdo.rds"))

For my purposes, it is safe to drop all the projects with missing PDO !

  • it turns out there are no Project Development Objectives spelled out until FY2001, so the usable population of projects (with PDO text) spans from FY 2001 to FY 2023.
  • I will also drop the projects with missing projectstatusdisplay (status)
  • … and boardapprovalFY (year of approval) > 2024 as incomplete data
projs <- all_proj_t %>%                      # 22,659
   filter(!is.na(pdo)) %>%                   # 11,353  -> 15,539
   filter( pdo != ".") %>%                   # 11,346 -> 15,532
   filter( pdo != "-") %>%                   # 11,344 -> 15,530
    filter( pdo != "NA") %>%                  # 11,344 -> 15,530
    filter( pdo != "N/A") %>%                 # 11,342 -> 15,527
    filter( pdo != "No change") %>%           # 11,340 -> 15,525
    filter( pdo != "No change to PDO following restructuring.") %>%   # 11,337 -> 15,522
    filter( pdo != "PDO remains the same.") %>%  # 11,335 -> 15,520
    filter(!(pdo %in% c("TBD", "TBD.", "Objective to be Determined."))) %>%   # 11,326 --> 15,500
    filter(!(pdo %in% c("XXXXXX", "XXXXX", "XXXX", "a"))) %>%   # 11,322 --> 15,496 = VALID PDO AVAILABLE
    filter(!is.na(projectstatusdisplay)) %>%  # 11,319  --> 14,076 [diff 1420]
    filter(!is.na(boardapprovalFY)) %>% # 9,143  --> 11,656 [diff 2420]
    filter(boardapprovalFY < 2024)   # 8811    --> 11,279 [diff 377]
   # # --- keep only the columns of interest
   # select(id, 
   #        pr_name = project_name, 
   #        pdo, 
   #        boardapprovalFY, 
   #        closingdateFY,
   #        status = projectstatusdisplay, 
   #        regionname, countryname, 
   #        sector1, 
   #        #sector1_perc_25, sector2, sector3, 
   #        theme1 ,
   #        #theme1_perc_25, theme2,  
   #        lendinginstr,
   #        env_cat = envassesmentcategorycode, 
   #        ESrisk = esrc_ovrl_risk_rate ,
   #        curr_total_commitment)   

[Tbl] for recapping missing tracking

# Define a named list of filtering steps using anonymous functions (lambdas).
# Each step applies a filter condition to remove unwanted values in the data.
steps <- list(
  "Remove pdo NA" = ~ filter(.x, !is.na(pdo)),
  "Remove pdo '.'" = ~ filter(.x, pdo != "."),
  "Remove pdo '-'" = ~ filter(.x, pdo != "-"),
  "Remove pdo 'NA'" = ~ filter(.x, pdo != "NA"),
  "Remove pdo 'N/A'" = ~ filter(.x, pdo != "N/A"),
  "Remove pdo 'No change'" = ~ filter(.x, pdo != "No change"),
  "Remove pdo full 'No change' sentence" = ~ filter(.x, pdo != "No change to PDO following restructuring."),
  "Remove 'PDO remains the same.'" = ~ filter(.x, pdo != "PDO remains the same."),
  "Remove pdo TBD values" = ~ filter(.x, !pdo %in% c("TBD", "TBD.", "Objective to be Determined.")),
  "Remove pdo XXX placeholders" = ~ filter(.x, !pdo %in% c("XXXXXX", "XXXXX", "XXXX", "XXX", "a")),
  "Remove NA projectstatusdisplay" = ~ filter(.x, !is.na(projectstatusdisplay)),
  "Remove NA boardapprovalFY" = ~ filter(.x, !is.na(boardapprovalFY)),
  "Filter boardapprovalFY < 2024" = ~ filter(.x, boardapprovalFY < 2024)
)

# Initialize an empty tracking table
tracking <- tibble(step = character(), n_rows = integer(), excluded = integer())

# Start with the full dataset
current <- all_proj_t
# 
tracking <- add_row(tracking, step = "Start", n_rows = nrow(current), excluded = NA)

# Loop over each step, applying the filter and tracking row counts
for (i in seq_along(steps)) {
  name <- names(steps)[i]
  fn <- as_function(steps[[i]])  # Convert formula to function
  before <- nrow(current)
  current <- fn(current)         # Apply the filter
  after <- nrow(current)
  
  tracking <- add_row(
    tracking,
    step = name,
    n_rows = after,
    excluded = before - after
  )
}

# Final cleaned dataset
projs <- current
saveRDS(tracking, here("data" , "derived_data", "tracking.rds"))#  11,279 obs
# Summary of row counts at each filtering step
tracking_k <-  tracking %>%  kable()
tracking_k 
step n_rows excluded
Start 22569 NA
Remove pdo NA 15539 7030
Remove pdo ‘.’ 15532 7
Remove pdo ‘-’ 15530 2
Remove pdo ‘NA’ 15530 0
Remove pdo ‘N/A’ 15527 3
Remove pdo ‘No change’ 15525 2
Remove pdo full ‘No change’ sentence 15522 3
Remove ‘PDO remains the same.’ 15520 2
Remove pdo TBD values 15500 20
Remove pdo XXX placeholders 15495 5
Remove NA projectstatusdisplay 14075 1420
Remove NA boardapprovalFY 11655 2420
Filter boardapprovalFY < 2024 11278 377
saveRDS(t, here("data" , "derived_data", "tracking_k.rds"))#  11,279 obs
# delete all in env EXCEPT for c(... )
rm(list=setdiff(ls(), c("projs", "cnames" )))

Duplicated PDOs 🤯

str(projs)
skim(projs$id) # 11,278
skim(projs$pdo) # 9914

n_distinct(projs$id) # 11,278
ndist_pdo <- n_distinct(projs$pdo) # 9914 !!!!!!!! 

 
# Detect duplicated PDOs and the corresponding IDs
duplicates <- projs %>%
  group_by(pdo) %>%
  filter(n() > 1) %>%   # Filter PDOs that appear more than once
  select(id, pdo)       # Select only id and pdo columns

# View the duplicated PDOs and their corresponding IDs
nrow(duplicates) # 2461

# Count duplicates per each unique ID based on duplicated PDOs
count_dp <- duplicates %>%          # Filter PDOs that appear more than once
  group_by(pdo) %>%             # Group by id
  summarise(dup_count = n())   # Count occurrences of duplicated PDOs per id

min(count_dp$dup_count)    # 2
max(count_dp$dup_count)    # 13
mean(count_dp$dup_count)   # 2.243391
n_distinct(duplicates$id)  # 2461 projectes with duplicated PDOs
n_distinct(duplicates$pdo) # 1097 unique PDOs are duplicated in the dataset

There are 2461 projects with NON-UNIQUE PDO text in the dataset. In some cases, evidently, the same PDO is used for multiple projects (from a minimum of 2 to a maximum of 13 time!!!), most likely when there is a parent project or subsequent phases of the same.

Manual correction text [projs] 🤚🏻

(Here I re-wrote the PDO for some projects that had misspelled words, lacking spaces, some typos and HTML formatting characters. No substantial changes)

projs$pdo[projs$id == "P164414"] <- "The Multisector Development Policy Financing (DPF) intends to support Ukraine's highest priority reforms to move from economic stabilization to stronger and sustained economic growth by addressing deeper structural bottlenecks and governance challenges in key areas. Possible policy areas include : (i) strengthening private sector competitiveness, including reforming land markets and the financial sector; (ii) promoting sustainable and effective public services, including reforming pensions, social assistance, and health; and (iii ) improving governance, including reforming anticorruption institutions and tax administration. The financing  DPL or Policy Based Guarantee (PBG)."

projs$pdo[projs$id == "P111432"] <- "Project development objectives for RCIP 3 include the following: Malawi: Support the Recipient's efforts to improve the quality, availability and affordability of broadband within its territory for both public and private users. Mozambique: Support the Recipient's efforts to contribute to lower prices for international capacity and extend the geographic reach of broadband networks and to contribute to improved efficiency and transparency through eGovernment applications. Tanzania: Support the Recipient's efforts to: (i) lower prices for international capacity and extend the geographic reach of broadband networks; and (ii) improve the Government's efficiency and transparency through eGovernment applications."

projs$pdo[projs$id == "P252350"] <- "The Program Development Objective is to expand opportunities for the acquisition of quality, market-relevant skills in selected economic sectors. The selected economic sectors include Energy, Transport and Logistics, and Manufacturing (with a focus on ‘Made-Rwanda’ products such as construction materials, light manufacturing and agro-processing). Building skills to advance the country’s economic agenda is a key priority of the GoR’s ongoing Economic Development and Poverty Reduction Strategy-2 (EDPRS2) launched in 2013. EDPRS2 builds on the country’s Vision 2020 which seeks to transform the country by raising its per capita GDP to middle-income level by 2020. The Program is grounded in the Government of Rwanda’s (GoR) National Employment Programs (NEP) approved by Cabinet in 2014. NEP was designed to address the employment challenges in Rwanda and equip its population with the skills required to supporteconomic development. The main results areas of the operation are: (i) reinforcing governance of the skills development system; (ii) ensuring provision of quality training programs with market relevance; (iii) expanding opportunities for continuous upgrading of job-relevant skills for sustained employability; and (iv) capacity building for implementation. The Program will disburse against achievement of specific Disbursement Linked Results (DLRs) in these results areas"

projs$pdo[projs$id == "P143376"] <- "The Project main objective is to promote the reduction of climate change impacts in the Cerrado of the west of the Bahia State by: (i) promoting the environmental regularization of land holdings in the targeted municipalities and support actions to promote recovery of environmental liabilities; and (ii) strengthening the State's capacity to prevent and combat forest fires through the integration of local actors and promoting the adoption of sustainable production practices in the targeted municipalities."

projs$pdo[projs$id == "P129428"] <- "The development objective of this task is to support the Ministry of Finance and other agencies with the Government of the Dominican Republic in their efforts to improve public expenditure management. Specifically, the objectives to be financed are to: (i) design and implement and integrate multiyear planningand budgeting; (ii) design and implement a Treasury Single Account for the central administration; and (iii) improve public sector procurement policies and practices."

 
projs$pdo[projs$id == "P127665"] <- "This development policy loan supports the Government of Croatia's reform efforts with the aim to: (i) enhance fiscal sustainability through expenditure-based consolidation; and (ii) strengthen investment climate."

projs$pdo[projs$id == "P075523"] <- "The project’s development objectives are to: (i) enhance the economic, environmental and social sustainability of transport and coastal infrastructure assets; and (ii) manage those assets and natural resources and respond to national emergencies, through an effective partnership with private sector stakeholders."

projs$pdo[projs$id == "P098889"] <- "The project development objective will be changed to benefit cotton farmers and create the conditions for sustainable growth of cotton production in low-income areas of Tajikistan through improved policy environment an better access to finance."

projs$pdo[projs$id == "P151058"] <- "The objective of the proposed project is to support the development of a seafood cluster in Nouadhibou that promotes sustainable management of fisheries and generates value for the communities."

projs$pdo[projs$id == "P173680"] <- "The Program Development Objective is to support economic stability and inclusive growth in Rwanda by (i) promoting long-term fiscal sustainability, (ii) enhancing coverage and equity in access to critical social sector programs, (iii) enabling learning and development of children, and (iv) strengthening decentralized capacity and accountability for accelerated human capital development."

projs$pdo[projs$id == "P159132"] <- "The PDO is to strengthen capacity at NEPAD Agency to support TerrAfrica member countries on knowledge management and actions for scale-up Sustainable Land and Water Management (SLWM). The project objectives have been further specified along the project's three components: Component 1, Support for Regional Coalition Building: Support NEPAD Agency to reinforce the regional coalition that aligns assistance to countries on SLWM and related issues. Component 2, Support for Mutual Learning Among Member Countries:  Support NEPAD Agency's mandate to meet country demand for knowledge and best practice tools to improve land management and climate resilience while maintaining natural capital to reinforce on-going investment operations in countries, as well as national and secto r development planning. Component 3, Support for Member Countries' SLWM Investment Planning: Strengthen NEPAD Agency's role in mee ting country demand for expertise and tools to develop and implement SLWM work programs and national SLWM investment frameworks."


projs$pdo[projs$id == "P122642"] <- "The project development objectives are: (i) improve the administration of the public pension schemes; and (ii) pilot a modest social safety net program as a first step to developing a sustainable approach to safety nets in the country.  The project development objective is to: (i) improve the administration of the public pension schemes; and (ii) develop administrative systems for safety nets interventions, with focus on targeting and benefit payment delivery, while delivering cash benefits to the poorest families in targeted pilot districts."

projs$pdo[projs$id == "P155961"] <- "The Project Development Objective (PDO) is to improve the accessibility to best practices and new knowledge on fisheries management of selected SSA countries. More specifically, this project will: (a) Empower the AU to take the lead on a long lasting fisheries partnership mechanism for Africa, leading to regional policy reforms; (b) Support knowledge exchange, advisory services, data management and monitoring support for the various current and pipeline investments in the region to ensure that lessons from successes and failures of country and LME-level investments and newly generated knowledge on fisheries are adequately disseminated, for a common understanding of the fisheries reforms necessary to meet the WSSD and the 2014 Policy Framework targets; and (c) Support individual coastal countries to access financing to support policy reform and investment through technical support in preparation of fund ing proposals. Attention will be given to reflecting lessons learned from other similar projects, replicating successful interventions, and ensuring continued technical support during project implementation – so this portfolio becomes a series of case studies for the region. The Regional Partnership for African Fisheries Policy Reform is also a follow-up phase medium-sized project to the GEF/World Bank Strategic Partnership project, which was implemented by the AU between 2009 and 2011, and the last project planned un der the Program Framework Document “Strategic Partnership for Sustainable Fisheries Management in the Large Marine Ecosystems in Africa” (P125797). The Project will be financed through a US$2 million Bank-Executed GEF Trust Fund between 2015 and 2018 for a duration of 3 years. It will act as a regional knowledge and monitoring hub for a SSA GEF/World Bank portfolio of current and pipeline fisheries reform projects (“participating projects”) and related partner-supported activities, with the objective of improving the accessibility to best practices and new knowledge on fisheries management of selected SSA countries. Within this context, it is expected th at the Project will assist African coastal countries to make concrete progress towards achieving the fisheries and poverty reduction  targets set by the 2002 WSSD, the follow-up 2012 United Nations Conference on Sustainable Development (Rio+20), and the 2014 Policy  Framework and Reform Strategy for Fisheries and Aquaculture in Africa. The Project is designed around three thematic components as follows (see Annex C for further detail): Component 1: Coalition building (US$0.75 million) Component 2: Knowledge support (US$1 .05 million) Component 3: Portfolio support (US$0.20 million)."

projs$pdo[projs$id == "P112186"] <- "The project objective is to improve the institutional and infrastructure environment for farmers and herders, with a strong emphasis on the livestock sector. More specifically, the project will increase farmers productivity, particularly of livestock farmers in the project areas and reduce animal diseases that have a public health impact (e.g., brucellosis).  The project will provide critical capital investments, strengthen key support services, deliver appropriate know-how, and facilitate and support effective and sustainable management of the country's valuable pasture resources.  This would enable farmers and herders to improve the productivity, profitability and sustainability of their enterprises and thereby help reduce poverty. Through its specific support for a regionally focused demonstration program to control brucellosis, the project would in addition contribute significantly to improved public health. The revised PDO is to improve the Recipient's institutional and infrastructure environment for more productive, profitable and sustainable livestock and crop production by pasture users and smallholder farmers, as well as to reduce the economic impact of the zoonotic disease burden in the human population."


projs$pdo[projs$id == "P099010"] <- "The seventh poverty reduction support credit for Burkina Faso is the first of a third series of PRSCs in support of Burkina Faso's Poverty Reduction Strategy (PRS), adopted in 2000 and updated in 2004. The PRS and its latest Annual Progress Report cover the period 2006 to 2008. This new series of PRSCs (7, 8 and 9) will support the acceleration of real growth and the reduction of poverty incidence in Burkina Faso by: (i) improving the investment climate and promoting exports; (ii) improving access to basic social services through deepened decentralization and strengthened institutional capacity; and (iii) promoting efficiency, transparency and accountability in the use of public resources through enhanced public financial management."


projs$pdo[projs$id == "P069934"] <- "The development objectives of the Pernambuco Integrated Development: Education Quality Improvement Project are to (a) improve the quality, efficiency, and inclusiveness of the public education system; (b) modernize and strengthen the managerial, financial, and administrative capacity of the Secretariat of Education to set policies and guidelines for the sector and deliver public education efficiently; and (c) support the overall state modernization effort through interventions to be carried out in the Secretariat of Education and to be replicated in other state institutions."

projs$pdo[projs$id == "P082278"] <- "The objective of the proposed PAL series would be to enhance economic growth in Croatia through (i) improving the investment climate, and (ii) reducing size and improving efficiency of the public sector. This will require fiscal adjustment, which is a theme running through the proposed operation, given the Croatia's fiscal position. The proposed PAL would also support the Government in its EU accession and integration efforts, and help ensure that key economic reforms under the acquis are effectively implemented and enforced. The PAL series would specifically support: Improvements in the investment climate by re-launching the privatization or liquidation of state-owned enterprises, improving financial discipline of enterprises, reducing the cost of doing business, and strengthening the judiciary. Efforts to improve governance by rationalizing and improving efficiency of public administration, and improving public expenditure management. Measures to enhance fiscal sustainability of sector programs by addressing health financing, rationalizing social benefits, improving fiscal and social sustainability of the pension system, and fiscal sustainability of railways operations." #P082278

projs$pdo[projs$id == "P084302"] <- "The overarching project objective is to improve the efficiency and effectiveness of irrigation water distribution in three AWBs (Ghotlu, Nara and Left Bank), particularly with respect to measures of reliability, equity and user satisfaction. This would be achieved by: (a) deepening and broadening the institutional reforms that are already underway in Sindh; (b) improving the irrigation system in a systematic way covering key hydraulic infrastructure, main and branch canals, and distributaries and minors; and (c) enhancing long-term sustainability of irrigation system through participatory irrigation management and developing institutions for improving operation and maintenance of the system and cost recovery. The improved water management would lead to increased agricultural production, employment and incomes over some about 1.8 million ha or more than 30 percent of the irrigated area in Sindh, and one of the poorest regions of the country." #P084302

projs$pdo[projs$id == "P094233"] <- "The Project Development Objective is to support the adoption of environmentally friendly land use practices on 3,400 hectares in two key Atlantic Forest watersheds in Espirito Santo, thereby contributing to biodiversity conservation. Key indicators include: Increase in area (3,400 has) under sustainable land use practices’ (SLMs). Which will be accomplished in part through the: (i) establishment and implementation of institutional arrangements for payment mechanisms for watershed conservation; and (ii) supporting the adoption of Sustainable Land Management (SLM) practices through the implementation of Payment for Environmental Services mechanisms and the participation and capacity building of local actors. The Global Environment Objective is to reduce threats to globally important biodiversity (for example, the endangered Muriqui monkey) from agricultural production systems and increase habitat for species in two key rainforest watersheds of the Atlantic Forest. Key indicators include: Area of critical habitat restored and/or protected from encroachment; which will be accomplished in part through the: (i) implementation of sustainable market-based mechanisms to finance Protected Areas (PA) management and conservation of biodiversity in agricultural landscapes." #P094233

projs$pdo[projs$id == "P106641"] <- "The DPL program supports staged progress towards medium-term program outcomes related to two CPS objectives: promoting broadbased economic growth by means of measures designed to enhance Panama's competitiveness, and establishing modern public financial management systems and institutions." #P106641

projs$pdo[projs$id == "P116125"] <- "The program has the following objectives: (i) to support structural reforms that will ensure fiscal consolidation over the medium term; (ii) to protect priority investment programs and the upfront fiscal costs of structural reforms critical to meeting Poland's goals of convergence with the rest of the EU; and (iii) to mitigate the social cost of the economic crisis." #P116125

projs$pdo[projs$id == "P150038"] <- "The overall Project goal is to provide support to the CIS national statistical offices to strengthen their capacity in collection of regular labor statistical data on the basis of uniformed methodology; to enhance knowledge exchange at national and international levels and to upgrade CIS and ILO statistical databases on labor. Specific Project's objectives are: modernisation of current labor statistics systems in the CIS countries through enhancing capacity of their national statistical services and other agencies producers of labor data by increasing the use of modern data collection tools, international methodology, definitions, standards and classifications; upgrading of currently conducted surveys such as household-based labor force surveys and/or enterprise-based surveys, population census; development of methodology for implementation of labor force surveys on a regular basis, including data collection on labor migration and employment in informal sector of economy."  #P150038

projs$pdo[projs$id == "P131325"] <- "The overarching project objective is to improve the efficiency and effectiveness of irrigation water distribution in three AWBs (Ghotlu, Nara and Left Bank), particularly with respect to measures of reliability, equity and user satisfaction. This would be achieved by: (a) deepening and broadening the institutional reforms that are already underway in Sindh; (b) improving the irrigation system in a systematic way covering key hydraulic infrastructure, main and branch canals, and distributaries and minors; and (c) enhancing long-term sustainability of irrigation system through participatory irrigation management and developing institutions for improving operation and maintenance of the system and cost recovery. The improved water management would lead to increased agricultural production, employment and incomes over some about 1.8 million ha or more than 30 percent of the irrigated area in Sindh, and one of the poorest regions of the country." #P131325

projs$pdo[projs$id == "P162274"] <- "The objective of the Project is to support the efforts of the Recipient to: (i) demobilize members of armed groups of Rwandan origin and members of the Rwandan Defense Force; and (ii) provide socioeconomic reintegration support to said members following demobilization, with a particular focus on the provision of such support to female, child, and disabled ex-combatants." #P162274

projs$pdo[projs$id == "P162166"] <- "The project's development objective is to empower citizens through social accountability interventions, to improve the utilization of post-Ebola recovery funds in the health and the education sectors in targeted districts of the member country." #P162166
Get rid of HTML tags like

and

and return just the inner text (e.g., “some text.”)

projs <- projs %>%
  mutate(pdo = stringr::str_replace_all(pdo, "<[^>]+>", ""))

_______

SAMPLE SPLITTING

Splitting the sample into training, validation, and test sets. In case of validating ML models but also to work on something smaller.

# ensure we always get the same result when sampling (for convenience )
set.seed(12345)

# use `regionname` as strata 
tabyl(projs$regionname)

projs_split <- projs %>%
  # define the training proportion as 75%
   rsample::initial_validation_split(prop = c(0.50, 0.25),
                                     # ensuring both sets are balanced in gender
                                     strata = regionname)

# resulting 3 datasets
projs_train <- rsample::training(projs_split)
projs_val <- rsample::validation(projs_split)
projs_test <- rsample::testing(projs_split)

nrow(projs_train) # 5,637
nrow(projs_val) # 2,820
nrow(projs_test) # 2,821
tabyl(projs_train$regionname)
tabyl(projs_val$regionname)
tabyl(projs_test$regionname)

_______

[from here I work on projs_train only]

_______

PoS tagging + tokenization with cleanNLP

PoS tagging with cleanNLP

Here’s the general process for tagging (or “annotating”) text with the cleanNLP package:

  1. Make a dataset where one column is the id (line number, chapter number, book+chapter, etc.), and another column is the text itself.
  2. Initialize the NLP tagger. You can use any of these:
    • cnlp_init_udpipe(): Use an R-only tagger that should work without installing anything extra (a little slower than the others, but requires no extra steps!)
    • cnlp_init_spacy(): Use spaCy (if you’ve installed it on your computer with Python)
    • cnlp_init_corenlp(): Use Stanford’s NLP library (if you’ve installed it on your computer with Java)
  3. Feed the data frame from step 1 into the cnlp_annotate() function and wait.
  4. Save the tagged data on your computer so you don’t have to re-tag it every time.

Prep HYPHENATED words

# Replace hyphens with a placeholder before annotation
pdo_train_to_tag <- projs_train %>%
   select(id, pdo) %>%
   mutate(pdo_2 = case_when(
      str_detect(pdo, "scale-up|scaled-up") ~ str_replace_all(pdo, "scale-up|scaled-up", "SCALEUPPLACEHOLDER"), # 17 
      str_detect(pdo, "follow-up") ~ str_replace_all(pdo, "follow-up", "FOLLOWUPPLACEHOLDER"), # 6
      str_detect(pdo, "know-how") ~ str_replace_all(pdo, "know-how", "KNOWHOWPLACEHOLDER"), # 5 
      str_detect(pdo, "Covid-19|COVID-19|covid-19") ~ str_replace_all(pdo, "Covid-19|COVID-19|covid-19", "COVIDPLACEHOLDER"), # ???
      # last !!!!!!!!
      str_detect(pdo, "(?<=\\p{L})-(?=\\p{L})") ~ str_replace_all(pdo, "-", "HYPHENWORD"), # 1396 
      TRUE ~ pdo
   )) 

☣️☣️⚠️ [LOOONG] Annotate with cleanNLP::cnlp_annotate()

[Only the 1st time]

# ---- Initialize the tagger
cleanNLP::cnlp_init_udpipe()

# ---- Use the built-in R-based tagger
pdo_train_tagged <- cleanNLP::cnlp_annotate(pdo_train_to_tag,
                                  text_name = "pdo_2",
                                  doc_name = "id")

# ---- save the input data as .rds
saveRDS(pdo_train_tagged, here("data","derived_data","pdo_train_tagged.rds"))

[Conditionally re-annotate]

# save the input data as .rds
saveRDS(pdo_train_to_tag, here("data","derived_data","pdo_train_to_tag.rds"))
# define the path to save the tagged data
input_path <- here("data","derived_data","pdo_train_to_tag.rds")
output_path <- here("data","derived_data","pdo_train_tagged.rds")
# ---- Initialize the tagger
cleanNLP::cnlp_init_udpipe()

# ---- Conditionally annotate the data only if it has changed since the last time
# Check if previous input data exists; if not, save it for the first time
if (!file.exists(input_path)) {
  # Save pdo_train_to_tag to initialize the tracking
  saveRDS(pdo_train_to_tag, input_path)
  message("Initial input saved.")
}

# Load the previous input if it exists
previous_input <- readRDS(input_path)

# Only re-run if the input data has changed
if (!identical(pdo_train_to_tag, previous_input)) {
  # Run the annotation
  pdo_train_tagged <- cleanNLP::cnlp_annotate(pdo_train_to_tag, 
                                              text_name = "pdo_2", 
                                              doc_name = "id")
  
  # Save the updated input and annotated result
  saveRDS(pdo_train_to_tag, input_path)
  saveRDS(pdo_train_tagged, output_path)
  
  message("Annotation updated.")
} else {
  # Load the previously saved output if data is unchanged
  if (file.exists(output_path)) {
    pdo_train_tagged <- readRDS(output_path)
    message("Loaded previous annotation.")
  } else {
    message("No previous annotation found. Please run the annotation first.")
  }
}

Manipulate the tagged data

# ---- Extract $token & Convert annotations to a tibble (data frame format)
pdo_train_tag_t <-  pdo_train_tagged$token %>% 
   # ... convert annotations to a tibble (data frame format)
   as_tibble()    # 225,360

# ---- Replace the placeholder back with a hyphen
pdo_train_tag_t <- pdo_train_tag_t %>% 
   mutate(word = token) %>%
   
   # Replace placeholders in the 'token' column
   mutate(token = case_when(
      str_detect(word, "HYPHENWORD") ~ str_replace_all(word, "HYPHENWORD", "-"),
      str_detect(word, "SCALEUPPLACEHOLDER") ~ str_replace_all(word, "SCALEUPPLACEHOLDER", "scale-up"),
      str_detect(word, "FOLLOWUPPLACEHOLDER") ~ str_replace_all(word, "FOLLOWUPPLACEHOLDER", "follow-up"),
      str_detect(word, "KNOWHOWPLACEHOLDER") ~ str_replace_all(word, "KNOWHOWPLACEHOLDER", "know-how"),
      str_detect(word, "COVIDPLACEHOLDER") ~ str_replace_all(word, "COVIDPLACEHOLDER", "covid-19"),
      TRUE ~ word
   )) %>%
   
   # Replace placeholders in the 'token_ws' column
   mutate(token_with_ws = case_when(
      str_detect(token_with_ws, "HYPHENWORD") ~ str_replace_all(token_with_ws, "HYPHENWORD", "-"),
      str_detect(token_with_ws, "SCALEUPPLACEHOLDER") ~ str_replace_all(token_with_ws, "SCALEUPPLACEHOLDER", "scale-up"),
      str_detect(token_with_ws, "FOLLOWUPPLACEHOLDER") ~ str_replace_all(token_with_ws, "FOLLOWUPPLACEHOLDER", "follow-up"),
      str_detect(token_with_ws, "KNOWHOWPLACEHOLDER") ~ str_replace_all(token_with_ws, "KNOWHOWPLACEHOLDER", "know-how"),
      str_detect(token_with_ws, "COVIDPLACEHOLDER") ~ str_replace_all(token_with_ws, "COVIDPLACEHOLDER", "covid-19"),   
      TRUE ~ token_with_ws
   )) %>%
   
   # Replace placeholders in the 'lemma' column
   mutate(lemma = case_when(
      str_detect(lemma, "HYPHENWORD") ~ str_replace_all(lemma, "HYPHENWORD", "-"),
      str_detect(lemma, "Scaleupplaceholder") ~ str_replace_all(lemma, "Scaleupplaceholder", "scale-up"),
      str_detect(lemma, "Followupplaceholder") ~ str_replace_all(lemma, "Followupplaceholder", "follow-up"),
      str_detect(lemma, "Knowhowplaceholder") ~ str_replace_all(lemma, "Knowhowplaceholder", "know-how"),
      str_detect(lemma, "Covidplaceholder") ~ str_replace_all(lemma, "Covidplaceholder", "covid-19"),
      TRUE ~ lemma
   )) %>%
 
   relocate(word, .after = doc_id) # 225,360

Check for duplicates

# check  (old =4403)
projs_train$id %>% n_distinct() #  5637
pdo_train_tagged$token$doc_id %>% n_distinct() #  5637 OK!
pdo_train_tag_t$doc_id %>% n_distinct() #  5637 OK! 

Re-add cols from projs_train to pdo_train_t

# ---- RE-add cols from `projs_train_token ` `
pdo_train_t <-  pdo_train_tag_t %>% 
   left_join(projs_train, by = c("doc_id" = "id"))  

# ---- Select cols 
pdo_train_t <- pdo_train_t %>% 
   select(
      proj_id=  doc_id                ,
      pdo,                                               # [FROM projs_train]
      sid , # sentence ID
      tid , # token ID within sentence
      word,
      token, # Tokenized form of the token.
      token_with_ws, # Token with trailing whitespace.
      lemma, # Lemmatized form of the token.
      upos, # Universal part-of-speech tag.
      xpos, # Language-specific part-of-speech tag
      feats, # Morphological features of the token.
      tid_source, # Token ID in the source document.
      relation, # Dependency relation to the head of the token.
      project_name,  # Name of the parent token.        # [FROM projs_train]
      regionname            ,
      countryname           ,
      projectstatusdisplay,                             # [FROM projs_train] 
      boardapprovaldate,                   # [FROM projs_train]
      boardapprovalFY,                    # [FROM projs_train] 
      closingdate,                         # [FROM projs_train]
      closingdateFY,                      # [FROM projs_train]
      sector1               ,       # [FROM projs_train]
      theme1                ,       # [FROM projs_train]
      borrower,                    # [FROM projs_train]
      lendinginstr          ,       # [FROM projs_train]
      envassesmentcategorycode ,       # [FROM projs_train]
      esrc_ovrl_risk_rate ,       # [FROM projs_train]
      
      curr_total_commitment        # [FROM projs_train]
   )

Ensure tid is numeric

# Ensure token_id is numeric
pdo_train_t <- pdo_train_t %>% 
  mutate(tid = as.numeric(tid))  # Convert token_id to numeric

Add low case token

pdo_train_t <- pdo_train_t %>% 
  mutate(token_l = tolower(token)) %>% 
   relocate(token_l, .after = token) %>% 
   select(-token_with_ws) 

Restore variations of “hyphenword” with “-”

#Replace variations of "hyphenword" with "-"
pdo_train_t <- pdo_train_t %>% 
  mutate(
    lemma = str_replace_all(lemma, regex("hyphenword|hyphenwor", 
                                         ignore_case = TRUE), "-")
  )

_______

Save clean Proj PDO train dataset pdo_train_t

# inputs 
saveRDS(projs_train, here("data" , "derived_data", "projs_train.rds"))#   4403 -> 5637  obs 
saveRDS(projs_test, here("data" , "derived_data", "proj_test.rds"))   #   2204 --> 2821 obs 
saveRDS(projs_val, here("data" , "derived_data", "projs_val.rds"))    #   2204 --> 2820 obs 
# projs_train + annotated pdo train --> 
saveRDS(pdo_train_t, here("data" , "derived_data", "pdo_train_t.rds"))#  249K --> 314K obs