# This script was written for data analysis reported in the manuscript:
# Freshwater trematodes differ from marine trematodes in patterns connected with 
#   Division of Labor
# By Allison T Neal, Moira Stettner, Renytzabelle Ortega-Cotto, Daniel Dieringer,
#   and Lydia Reed
# Submitted to PeerJ in October 2023
# The script was written by Allison Neal and is provided to allow easy replication
#   and critical evaluation of the analysis performed.  The script author does not
#   claim to be an efficient or expert R coder, so please excuse any inefficiencies 
#   in the code.

# This script imports data and cleans it up. It does not perform any analysis, but
#   all other scripts that perform analysis on this data use this as a starting point.

### MetaData ###
# Trematodes in the accompanying data file were collected from Vermont snails
# in summers 2019 and 2021 by Allison Neal and her students (2019: Mo Stettner, Mary
# Nsubuga, Kathryn Farnum, Dan Dieringer [October], Caleb Scully [October];
# 2021: Lydia Reed, Reine Gibb and Macall Meslin [one trip]).  Details on snail 
# collections may be found in databooks kept by Allison Neal

# In the data set, columns are:
#  snail- unique identifier assigned to infected snails; letter indicates snail family
#     H- Hydrobiidae
#     P- Planorbidae
#     Ph- Physidae
#     V- Viviparidae
#  redia- identifier for redia within a snail and body section (location); corresponds to 
#     photos.
#     Numbers with letters only appended (e.g. 'a','b','c') indicate that there were more rediae
#      present in the photo than indicated in the photo's label; these are separate rediae
#     Numbers with 'm10x' appended indicate that there was a duplicate photo taken of the redia
#      at a higher magnification (100x [10x objective] vs. 40x); these are duplicate photos. 
#     Measurements taken at the higher magnification are likely more accurate, so only those
#      will be retained in the final analysis (see Clean Up code below)
#     Numbers beginning with 'C2r' were taken on a different microscope (and computer- computer 2)
#      These are unique redia numbers.
#     Snail V172 has three rediae named A, B, C.  These are unique.  I believe they were collected 
#      during a different stage of the dissection than rediae 1-10.  They could be included in analysis, but 
#      none of the rediae in this infection are really a random or thorough sample (see Neal lab notebook,
#      from July 15, 2021)
#     Similarly, snail V182 has rediae with names starting with a, b, c, d.  I believe they were also collected
#      during a different stage of the dissection.  The numbers indicate the multiple rediae in photos a, b, c
#      and d.  Not random, or thoroughly sampled, but unique.
#     Snail P153 has two extra measurements (rediae are labeled 'extra1' and 'extra2') that were taken 
#      from photos of attack trials.  These should not be included in the general analysis for this infection 
#      and will be excluded when this infection is analyzed.
#  measurer- researcher who performed the measurements recorded.
#     MS - Moira Stettner
#     ATN- Allison T Neal
#     Note: there was a slight difference in the measurements of these two researchers, likely 
#     attributed to slight deviations in the calibration of their software. 
#     For a sample of 50 redia measured by both researchers, Neal's measurements were consistently
#     around 2-6% higher.  No correction was deemed necessary as no individual infection was measured
#     partially by each researcher.
#  length- length (micrometers) measured from photos using a segmented line in SPOT (MS) or ImageJ 
#     software (ATN).
#  width- width (micrometers) at widest point on main body (i.e. not an appendage/collar)
#     measured from photos in SPOT (MS) or ImageJ software (ATN).
#  pharynx- width (micrometers) at widest point of pharynx measured from photos in SPOT (MS) or
#     ImageJ (ATN) 
#  location- where in the snail the sample of redia were taken from. Entries are:
#     foot- snail head/foot and mantle
#     mid- section between foot and gonad/digestive
#     gonad- posterior end of snail containing gonad/digestive organs
#     general or gonad/Mid or mid/Foot- body sections (or specified body sections) were not separated
#     Slide1 or Slide 2- body sections were not separated, but two samples were taken and numbered independently
#      (i.e. same as general, but separate designations were maintained for easy reference to photos)
#  notes- notes recorded by measurer; some describe redia that may be damaged or show interesting features;
#     relevant notes to consider for analysis may include:
#       flat/flattened- slide may have been prepared with too little water; measurements may be inconsistent
#       with other rediae from this snail.
#  unreliable- measurements that were judged unreliable by the measurer
#     L- length
#     W- width
#     P- pharynx
#     U- unknown (some samples were measured before we started specifying which measurements
#     were unreliable; all are now recorded as unknown, which should probably mean all measurements
#     are excluded for these samples)
#  magnification- some samples may have the magnification recorded (esp. if they're not 40x, which is
#     most typical)
#########################################################################################################
### Import Data and Clean Up ###
# Read the raw measurement table from the csv file.
# NOTE: the path below is specific to the author's computer; point it at your own copy of
#   DOLVTsizes.csv before running.
# 'header' is written out in full and TRUE is used instead of T: partial argument names are
#   fragile and T is an ordinary variable that can be reassigned.
# stringsAsFactors=FALSE keeps text columns (e.g. location, notes, unreliable) as character
#   on R versions before 4.0, where the default was TRUE.  The clean-up code later in this
#   script writes new text values into those columns, which would turn into NA (with a
#   warning) in a factor column that does not already have that level.
data<-read.csv('/Users/aneal1/OneDrive - Norwich University/2_Research/1_Trematode Research/Div of Labor/Analysis/Redial Sizes/DOLVTsizes.csv',header=TRUE,sep=',',stringsAsFactors=FALSE)

# Snail should be a factor variable (this is the only column deliberately stored as a factor)
data$snail<-as.factor(data$snail)

# Remove duplicate rediae (a few rediae were measured from multiple photos with different magnification)
# Generate what should be a unique sample number for every redia (there should be no rediae
#   with the same snail, location and redia values combined).
# NOTE: this is built from location exactly as entered; the capitalization of location is
#   only standardized further down, so 'sample' keeps the original spelling.
data$sample<-paste(data$snail,data$location,data$redia,sep='_')
length(data$sample)==length(unique(data$sample)) # Checks if all samples are unique (TRUE if so); they are

# Identify photos with different magnification (redia label contains 'm', e.g. 'm10x').
#   The search is done once and the result stored so that the rows displayed are exactly
#   the rows used to build 'dup'.
mag.rows<-grep('m',data$redia)
mag.rows # for all samples, duplicate measurement is one row up (I checked)
dup<-mag.rows-1 # indices for the values I want removed (the row above each re-photographed redia)
# seq_len(nrow(data)) is used instead of 1:dim(data)[1] because 1:n counts backwards
#   (1, 0) when n is 0.  If there are no matches, 'dup' is empty and no rows are removed.
data<-subset(data,!(seq_len(nrow(data)) %in% dup))


# Sanity check: view the ranges of length, width and pharynx to make sure they are
#   plausible and no values were entered incorrectly (enough to be outside reasonable bounds)
summary(data[['length']])
summary(data[['width']])
summary(data[['pharynx']])
pharynx.order<-order(data[['pharynx']])
data[pharynx.order,]  # rows sorted by pharynx size, to see smallest/largest
# Note about pharynx measurements:
#   It appears that Mo may not always have measured the longer of the two
#   dimensions on the pharynges.  Correcting this would require me to repeat
#   all of Mo's measurements, which might itself introduce issues because there
#   seems to be a slight discrepancy in the calibration of my software and hers
#   (so I might actually have to repeat ALL of her measurements).  I think it's ok.
#   Mo was consistent with how she took measurements at least within
#   a given infection (that seems to be reflected in a subset of the measurements
#   I did go back and try to repeat- the relative sizes are mostly fine), so
#   comparisons within a given infection (which is most of the analysis) should
#   be ok, but we should proceed with caution when comparing among infections.


# Look at how body regions were entered and fix inconsistencies
table(data$location)
# Lookup of spellings to replace (names) and the standard spelling to use instead (values);
#   any location not listed here is left as entered.
location.fixes<-c('Foot'='foot',
                  'General'='general',
                  'Gonad'='gonad',
                  'Mid'='mid',
                  'Mid+Foot'='midFoot',
                  'MidFoot'='midFoot')
for (loc.from in names(location.fixes)) {
  data$location[data$location==loc.from]<-location.fixes[[loc.from]]
}


# Consider what to do with issues in notes

# One issue: flattened rediae (too little water used when preparing slide)
#    I reviewed the notes and found that two words described these: flat and flattened
#    It did not appear that there were any comments indicating something was "not flat"
#    There were two rediae that were "squished", but I'm not sure they're worth removing
#   For now, create a variable so the flat/flattened samples can easily be removed from analysis:
#   "Y" where the notes contain "flat" (which also catches "flattened"), "N" everywhere else
flat.note<-grepl('flat',data$notes)
data$flat<-ifelse(flat.note,"Y","N")

# Second issue: damaged; this is already recorded more succinctly in 'unreliable' column
table(data$unreliable) #summary of what should be removed

# Third issue: things that are marked as "not a redia" or some variation on this
#   Each phrase below is handled the same way: find the rows whose notes contain it, display
#   those notes (to make sure nothing unexpected is matched), then set all three measurements
#   (length, width, pharynx) to unreliable in case any measurements were recorded.
data$notes[grepl('redia',data$notes)] # start by getting a list of all the different ways this is phrased

#   "unclear whether this is actually a redia"
hit<-grepl("unclear whether this is actually a redia",data$notes)
data$notes[hit] # check to make sure this doesn't get anything unexpected
data$unreliable[hit]<-"LWP"

#   "may not be a redia" (with or without additions of "may or" or "...")
hit<-grepl("may not be a redia",data$notes)
data$notes[hit] # check to make sure this doesn't get anything unexpected
data$unreliable[hit]<-"LWP"

#   "I'm not sure if any of these are a redia"
hit<-grepl("these are a redia",data$notes)
data$notes[hit] # check to make sure this doesn't get anything unexpected
data$unreliable[hit]<-"LWP"

# "this is a redia" cluster  (oddly, this doesn't get anything that says it is a redia with confidence!)
#   "not clear if this is a redia"
#   "not sure this is a redia"
#   "not sure if this is a redia"
#   "I'm not confident this is a redia"
#   "I'm not positive this is a redia"
#   "I don't think this is a redia"
#   "I think this is a redia" -- sounds uncertain/unreliable
#   "I think this is a redia; pharynx out of focus"
#   "I think this is a redia, but hard to see (out of focus, light colored)"
hit<-grepl("this is a redia",data$notes)
data$notes[hit] # check to make sure this doesn't get anything unexpected
data$unreliable[hit]<-"LWP"

# "not a redia" cluster
#    "I think this is a cercaria, not a redia"
#    "maybe not a redia"
#    "not a redia"
#    "damaged? not a redia?"
hit<-grepl("not a redia",data$notes)
data$notes[hit] # check to make sure this doesn't get anything unexpected
data$unreliable[hit]<-"LWP"


# Once all unreliable measurements have been flagged, can remove them with this:
#   Add a column for each measurement (lengthR, widthR, pharR) that holds only the reliable
#   values; rows judged unreliable for that measurement are left as NA.
#   A measurement is treated as unreliable when
#     - its letter (L, W or P) appears in the 'unreliable' column, or
#     - 'unreliable' contains U (not specified which measurements are unreliable), or
#     - the redia is flagged as flat (flat=="Y").
#   Reliable rows are found with setdiff() rather than all.rows[-unrel...]: negative indexing
#   with an empty vector selects nothing, so if no rows happened to be flagged for a
#   measurement, every value in that column would have been left as NA.
all.rows<-seq_len(nrow(data))
#      Length
data$lengthR<-NA
unrel.length<-grep('L',data$unreliable) # unreliable because length specified as unreliable
unrel.length<-c(unrel.length,grep('U',data$unreliable)) # unreliable because not specified which measurements are unreliable
unrel.length<-c(unrel.length,grep('Y',data$flat)) # unreliable because they are flat/flattened
rel.length<-setdiff(all.rows,unrel.length) # all rows not flagged above (all rows if none are flagged)
data$lengthR[rel.length]<-data$length[rel.length]
#      Width
data$widthR<-NA
unrel.width<-grep('W',data$unreliable) # unreliable because width specified as unreliable
unrel.width<-c(unrel.width,grep('U',data$unreliable)) # unreliable because not specified which measurements are unreliable
unrel.width<-c(unrel.width,grep('Y',data$flat)) # unreliable because they are flat/flattened
rel.width<-setdiff(all.rows,unrel.width) # all rows not flagged above (all rows if none are flagged)
data$widthR[rel.width]<-data$width[rel.width]
#      Pharynx
data$pharR<-NA
unrel.phar<-grep('P',data$unreliable) # unreliable because pharynx size specified as unreliable
unrel.phar<-c(unrel.phar,grep('U',data$unreliable)) # unreliable because not specified which measurements are unreliable
unrel.phar<-c(unrel.phar,grep('Y',data$flat)) # unreliable because they are flat/flattened
rel.phar<-setdiff(all.rows,unrel.phar) # all rows not flagged above (all rows if none are flagged)
data$pharR[rel.phar]<-data$pharynx[rel.phar]