1 /* 2 * Copyright 2006-2010, Haiku, Inc. All Rights Reserved. 3 * Distributed under the terms of the MIT License. 4 * 5 * Authors: 6 * Andrew Galante, haiku.galante@gmail.com 7 * Axel Dörfler, axeld@pinc-software.de 8 * Hugo Santos, hugosantos@gmail.com 9 */ 10 11 12 #include "TCPEndpoint.h" 13 14 #include <netinet/in.h> 15 #include <netinet/ip.h> 16 #include <netinet/tcp.h> 17 #include <new> 18 #include <signal.h> 19 #include <stdlib.h> 20 #include <string.h> 21 22 #include <KernelExport.h> 23 #include <Select.h> 24 25 #include <net_buffer.h> 26 #include <net_datalink.h> 27 #include <net_stat.h> 28 #include <NetBufferUtilities.h> 29 #include <NetUtilities.h> 30 31 #include <lock.h> 32 #include <tracing.h> 33 #include <util/AutoLock.h> 34 #include <util/list.h> 35 36 #include "EndpointManager.h" 37 38 39 // References: 40 // - RFC 793 - Transmission Control Protocol 41 // - RFC 813 - Window and Acknowledgement Strategy in TCP 42 // - RFC 1337 - TIME_WAIT Assassination Hazards in TCP 43 // 44 // Things this implementation currently doesn't implement: 45 // - TCP Slow Start, Congestion Avoidance, Fast Retransmit, and Fast Recovery, 46 // RFC 2001, RFC 2581, RFC 3042 47 // - NewReno Modification to TCP's Fast Recovery, RFC 2582 48 // - Explicit Congestion Notification (ECN), RFC 3168 49 // - SYN-Cache 50 // - SACK, Selective Acknowledgment - RFC 2018, RFC 2883, RFC 3517 51 // - Forward RTO-Recovery, RFC 4138 52 // - Time-Wait hash instead of keeping sockets alive 53 // 54 // Things incomplete in this implementation: 55 // - TCP Extensions for High Performance, RFC 1323 - RTTM, PAWS 56 57 #define PrintAddress(address) \ 58 AddressString(Domain(), address, true).Data() 59 60 //#define TRACE_TCP 61 //#define PROBE_TCP 62 63 #ifdef TRACE_TCP 64 // the space before ', ##args' is important in order for this to work with cpp 2.95 65 # define TRACE(format, args...) 
dprintf("%" B_PRId32 ": TCP [%" \ 66 B_PRIdBIGTIME "] %p (%12s) " format "\n", find_thread(NULL), \ 67 system_time(), this, name_for_state(fState) , ##args) 68 #else 69 # define TRACE(args...) do { } while (0) 70 #endif 71 72 #ifdef PROBE_TCP 73 # define PROBE(buffer, window) \ 74 dprintf("TCP PROBE %" B_PRIdBIGTIME " %s %s %" B_PRIu32 " snxt %" B_PRIu32 \ 75 " suna %" B_PRIu32 " cw %" B_PRIu32 " sst %" B_PRIu32 " win %" \ 76 B_PRIu32 " swin %" B_PRIu32 " smax-suna %" B_PRIu32 " savail %" \ 77 B_PRIuSIZE " sqused %" B_PRIuSIZE " rto %" B_PRIdBIGTIME "\n", \ 78 system_time(), PrintAddress(buffer->source), \ 79 PrintAddress(buffer->destination), buffer->size, fSendNext.Number(), \ 80 fSendUnacknowledged.Number(), fCongestionWindow, fSlowStartThreshold, \ 81 window, fSendWindow, (fSendMax - fSendUnacknowledged).Number(), \ 82 fSendQueue.Available(fSendNext), fSendQueue.Used(), fRetransmitTimeout) 83 #else 84 # define PROBE(buffer, window) do { } while (0) 85 #endif 86 87 #if TCP_TRACING 88 namespace TCPTracing { 89 90 class Receive : public AbstractTraceEntry { 91 public: 92 Receive(TCPEndpoint* endpoint, tcp_segment_header& segment, uint32 window, 93 net_buffer* buffer) 94 : 95 fEndpoint(endpoint), 96 fBuffer(buffer), 97 fBufferSize(buffer->size), 98 fSequence(segment.sequence), 99 fAcknowledge(segment.acknowledge), 100 fWindow(window), 101 fState(endpoint->State()), 102 fFlags(segment.flags) 103 { 104 Initialized(); 105 } 106 107 virtual void AddDump(TraceOutput& out) 108 { 109 out.Print("tcp:%p (%12s) receive buffer %p (%" B_PRIu32 " bytes), " 110 "flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32 111 ", wnd %" B_PRIu32, fEndpoint, name_for_state(fState), fBuffer, 112 fBufferSize, fFlags, fSequence, fAcknowledge, fWindow); 113 } 114 115 protected: 116 TCPEndpoint* fEndpoint; 117 net_buffer* fBuffer; 118 uint32 fBufferSize; 119 uint32 fSequence; 120 uint32 fAcknowledge; 121 uint32 fWindow; 122 tcp_state fState; 123 uint8 fFlags; 124 }; 125 126 class Send : 
public AbstractTraceEntry { 127 public: 128 Send(TCPEndpoint* endpoint, tcp_segment_header& segment, net_buffer* buffer, 129 tcp_sequence firstSequence, tcp_sequence lastSequence) 130 : 131 fEndpoint(endpoint), 132 fBuffer(buffer), 133 fBufferSize(buffer->size), 134 fSequence(segment.sequence), 135 fAcknowledge(segment.acknowledge), 136 fFirstSequence(firstSequence.Number()), 137 fLastSequence(lastSequence.Number()), 138 fState(endpoint->State()), 139 fFlags(segment.flags) 140 { 141 Initialized(); 142 } 143 144 virtual void AddDump(TraceOutput& out) 145 { 146 out.Print("tcp:%p (%12s) send buffer %p (%" B_PRIu32 " bytes), " 147 "flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32 148 ", first %" B_PRIu32 ", last %" B_PRIu32, fEndpoint, 149 name_for_state(fState), fBuffer, fBufferSize, fFlags, fSequence, 150 fAcknowledge, fFirstSequence, fLastSequence); 151 } 152 153 protected: 154 TCPEndpoint* fEndpoint; 155 net_buffer* fBuffer; 156 uint32 fBufferSize; 157 uint32 fSequence; 158 uint32 fAcknowledge; 159 uint32 fFirstSequence; 160 uint32 fLastSequence; 161 tcp_state fState; 162 uint8 fFlags; 163 }; 164 165 class State : public AbstractTraceEntry { 166 public: 167 State(TCPEndpoint* endpoint) 168 : 169 fEndpoint(endpoint), 170 fState(endpoint->State()) 171 { 172 Initialized(); 173 } 174 175 virtual void AddDump(TraceOutput& out) 176 { 177 out.Print("tcp:%p (%12s) state change", fEndpoint, 178 name_for_state(fState)); 179 } 180 181 protected: 182 TCPEndpoint* fEndpoint; 183 tcp_state fState; 184 }; 185 186 class Spawn : public AbstractTraceEntry { 187 public: 188 Spawn(TCPEndpoint* listeningEndpoint, TCPEndpoint* spawnedEndpoint) 189 : 190 fListeningEndpoint(listeningEndpoint), 191 fSpawnedEndpoint(spawnedEndpoint) 192 { 193 Initialized(); 194 } 195 196 virtual void AddDump(TraceOutput& out) 197 { 198 out.Print("tcp:%p spawns %p", fListeningEndpoint, fSpawnedEndpoint); 199 } 200 201 protected: 202 TCPEndpoint* fListeningEndpoint; 203 TCPEndpoint* fSpawnedEndpoint; 
204 }; 205 206 class Error : public AbstractTraceEntry { 207 public: 208 Error(TCPEndpoint* endpoint, const char* error, int32 line) 209 : 210 fEndpoint(endpoint), 211 fLine(line), 212 fError(error), 213 fState(endpoint->State()) 214 { 215 Initialized(); 216 } 217 218 virtual void AddDump(TraceOutput& out) 219 { 220 out.Print("tcp:%p (%12s) error at line %" B_PRId32 ": %s", fEndpoint, 221 name_for_state(fState), fLine, fError); 222 } 223 224 protected: 225 TCPEndpoint* fEndpoint; 226 int32 fLine; 227 const char* fError; 228 tcp_state fState; 229 }; 230 231 class TimerSet : public AbstractTraceEntry { 232 public: 233 TimerSet(TCPEndpoint* endpoint, const char* which, bigtime_t timeout) 234 : 235 fEndpoint(endpoint), 236 fWhich(which), 237 fTimeout(timeout), 238 fState(endpoint->State()) 239 { 240 Initialized(); 241 } 242 243 virtual void AddDump(TraceOutput& out) 244 { 245 out.Print("tcp:%p (%12s) %s timer set to %" B_PRIdBIGTIME, fEndpoint, 246 name_for_state(fState), fWhich, fTimeout); 247 } 248 249 protected: 250 TCPEndpoint* fEndpoint; 251 const char* fWhich; 252 bigtime_t fTimeout; 253 tcp_state fState; 254 }; 255 256 class TimerTriggered : public AbstractTraceEntry { 257 public: 258 TimerTriggered(TCPEndpoint* endpoint, const char* which) 259 : 260 fEndpoint(endpoint), 261 fWhich(which), 262 fState(endpoint->State()) 263 { 264 Initialized(); 265 } 266 267 virtual void AddDump(TraceOutput& out) 268 { 269 out.Print("tcp:%p (%12s) %s timer triggered", fEndpoint, 270 name_for_state(fState), fWhich); 271 } 272 273 protected: 274 TCPEndpoint* fEndpoint; 275 const char* fWhich; 276 tcp_state fState; 277 }; 278 279 class APICall : public AbstractTraceEntry { 280 public: 281 APICall(TCPEndpoint* endpoint, const char* which) 282 : 283 fEndpoint(endpoint), 284 fWhich(which), 285 fState(endpoint->State()) 286 { 287 Initialized(); 288 } 289 290 virtual void AddDump(TraceOutput& out) 291 { 292 out.Print("tcp:%p (%12s) api call: %s", fEndpoint, 293 name_for_state(fState), 
fWhich); 294 } 295 296 protected: 297 TCPEndpoint* fEndpoint; 298 const char* fWhich; 299 tcp_state fState; 300 }; 301 302 } // namespace TCPTracing 303 304 # define T(x) new(std::nothrow) TCPTracing::x 305 #else 306 # define T(x) 307 #endif // TCP_TRACING 308 309 310 // constants for the fFlags field 311 enum { 312 FLAG_OPTION_WINDOW_SCALE = 0x01, 313 FLAG_OPTION_TIMESTAMP = 0x02, 314 // TODO: Should FLAG_NO_RECEIVE apply as well to received connections? 315 // That is, what is expected from accept() after a shutdown() 316 // is performed on a listen()ing socket. 317 FLAG_NO_RECEIVE = 0x04, 318 FLAG_CLOSED = 0x08, 319 FLAG_DELETE_ON_CLOSE = 0x10, 320 FLAG_LOCAL = 0x20 321 }; 322 323 324 static const int kTimestampFactor = 1000; 325 // conversion factor between usec system time and msec tcp time 326 327 328 static inline bigtime_t 329 absolute_timeout(bigtime_t timeout) 330 { 331 if (timeout == 0 || timeout == B_INFINITE_TIMEOUT) 332 return timeout; 333 334 return timeout + system_time(); 335 } 336 337 338 static inline status_t 339 posix_error(status_t error) 340 { 341 if (error == B_TIMED_OUT) 342 return B_WOULD_BLOCK; 343 344 return error; 345 } 346 347 348 static inline bool 349 in_window(const tcp_sequence& sequence, const tcp_sequence& receiveNext, 350 uint32 receiveWindow) 351 { 352 return sequence >= receiveNext && sequence < (receiveNext + receiveWindow); 353 } 354 355 356 static inline bool 357 segment_in_sequence(const tcp_segment_header& segment, int size, 358 const tcp_sequence& receiveNext, uint32 receiveWindow) 359 { 360 tcp_sequence sequence(segment.sequence); 361 if (size == 0) { 362 if (receiveWindow == 0) 363 return sequence == receiveNext; 364 return in_window(sequence, receiveNext, receiveWindow); 365 } else { 366 if (receiveWindow == 0) 367 return false; 368 return in_window(sequence, receiveNext, receiveWindow) 369 || in_window(sequence + size - 1, receiveNext, receiveWindow); 370 } 371 } 372 373 374 static inline bool 375 
is_writable(tcp_state state) 376 { 377 return state == ESTABLISHED || state == FINISH_RECEIVED; 378 } 379 380 381 static inline bool 382 is_establishing(tcp_state state) 383 { 384 return state == SYNCHRONIZE_SENT || state == SYNCHRONIZE_RECEIVED; 385 } 386 387 388 static inline uint32 tcp_now() 389 { 390 return system_time() / kTimestampFactor; 391 } 392 393 394 static inline uint32 tcp_diff_timestamp(uint32 base) 395 { 396 uint32 now = tcp_now(); 397 398 if (now > base) 399 return now - base; 400 401 return now + UINT_MAX - base; 402 } 403 404 405 static inline bool 406 state_needs_finish(int32 state) 407 { 408 return state == WAIT_FOR_FINISH_ACKNOWLEDGE 409 || state == FINISH_SENT || state == CLOSING; 410 } 411 412 413 // #pragma mark - 414 415 416 TCPEndpoint::TCPEndpoint(net_socket* socket) 417 : 418 ProtocolSocket(socket), 419 fManager(NULL), 420 fOptions(0), 421 fSendWindowShift(0), 422 fReceiveWindowShift(0), 423 fSendUnacknowledged(0), 424 fSendNext(0), 425 fSendMax(0), 426 fSendUrgentOffset(0), 427 fSendWindow(0), 428 fSendMaxWindow(0), 429 fSendMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 430 fSendQueue(socket->send.buffer_size), 431 fInitialSendSequence(0), 432 fDuplicateAcknowledgeCount(0), 433 fRoute(NULL), 434 fReceiveNext(0), 435 fReceiveMaxAdvertised(0), 436 fReceiveWindow(socket->receive.buffer_size), 437 fReceiveMaxSegmentSize(TCP_DEFAULT_MAX_SEGMENT_SIZE), 438 fReceiveQueue(socket->receive.buffer_size), 439 fRoundTripTime(TCP_INITIAL_RTT / kTimestampFactor), 440 fRoundTripDeviation(TCP_INITIAL_RTT / kTimestampFactor), 441 fRetransmitTimeout(TCP_INITIAL_RTT), 442 fReceivedTimestamp(0), 443 fCongestionWindow(0), 444 fSlowStartThreshold(0), 445 fState(CLOSED), 446 fFlags(FLAG_OPTION_WINDOW_SCALE | FLAG_OPTION_TIMESTAMP) 447 { 448 // TODO: to be replaced with a real read/write locking strategy! 
449 mutex_init(&fLock, "tcp lock"); 450 451 fReceiveCondition.Init(this, "tcp receive"); 452 fSendCondition.Init(this, "tcp send"); 453 454 gStackModule->init_timer(&fPersistTimer, TCPEndpoint::_PersistTimer, this); 455 gStackModule->init_timer(&fRetransmitTimer, TCPEndpoint::_RetransmitTimer, 456 this); 457 gStackModule->init_timer(&fDelayedAcknowledgeTimer, 458 TCPEndpoint::_DelayedAcknowledgeTimer, this); 459 gStackModule->init_timer(&fTimeWaitTimer, TCPEndpoint::_TimeWaitTimer, 460 this); 461 462 T(APICall(this, "constructor")); 463 } 464 465 466 TCPEndpoint::~TCPEndpoint() 467 { 468 mutex_lock(&fLock); 469 470 T(APICall(this, "destructor")); 471 472 _CancelConnectionTimers(); 473 gStackModule->cancel_timer(&fTimeWaitTimer); 474 T(TimerSet(this, "time-wait", -1)); 475 476 if (fManager != NULL) { 477 fManager->Unbind(this); 478 put_endpoint_manager(fManager); 479 } 480 481 mutex_destroy(&fLock); 482 483 // we need to wait for all timers to return 484 gStackModule->wait_for_timer(&fRetransmitTimer); 485 gStackModule->wait_for_timer(&fPersistTimer); 486 gStackModule->wait_for_timer(&fDelayedAcknowledgeTimer); 487 gStackModule->wait_for_timer(&fTimeWaitTimer); 488 489 gDatalinkModule->put_route(Domain(), fRoute); 490 } 491 492 493 status_t 494 TCPEndpoint::InitCheck() const 495 { 496 return B_OK; 497 } 498 499 500 // #pragma mark - protocol API 501 502 503 status_t 504 TCPEndpoint::Open() 505 { 506 TRACE("Open()"); 507 T(APICall(this, "open")); 508 509 status_t status = ProtocolSocket::Open(); 510 if (status < B_OK) 511 return status; 512 513 fManager = get_endpoint_manager(Domain()); 514 if (fManager == NULL) 515 return EAFNOSUPPORT; 516 517 return B_OK; 518 } 519 520 521 status_t 522 TCPEndpoint::Close() 523 { 524 MutexLocker locker(fLock); 525 526 TRACE("Close()"); 527 T(APICall(this, "close")); 528 529 if (fState == LISTEN) 530 delete_sem(fAcceptSemaphore); 531 532 if (fState == SYNCHRONIZE_SENT || fState == LISTEN) { 533 // TODO: what about linger in case of 
SYNCHRONIZE_SENT? 534 fState = CLOSED; 535 T(State(this)); 536 return B_OK; 537 } 538 539 status_t status = _Disconnect(true); 540 if (status != B_OK) 541 return status; 542 543 if (socket->options & SO_LINGER) { 544 TRACE("Close(): Lingering for %i secs", socket->linger); 545 546 bigtime_t maximum = absolute_timeout(socket->linger * 1000000LL); 547 548 while (fSendQueue.Used() > 0) { 549 status = _WaitForCondition(fSendCondition, locker, maximum); 550 if (status == B_TIMED_OUT || status == B_WOULD_BLOCK) 551 break; 552 else if (status < B_OK) 553 return status; 554 } 555 556 TRACE("Close(): after waiting, the SendQ was left with %" B_PRIuSIZE 557 " bytes.", fSendQueue.Used()); 558 } 559 return B_OK; 560 } 561 562 563 void 564 TCPEndpoint::Free() 565 { 566 MutexLocker _(fLock); 567 568 TRACE("Free()"); 569 T(APICall(this, "free")); 570 571 if (fState <= SYNCHRONIZE_SENT) 572 return; 573 574 // we are only interested in the timer, not in changing state 575 _EnterTimeWait(); 576 577 fFlags |= FLAG_CLOSED; 578 if ((fFlags & FLAG_DELETE_ON_CLOSE) == 0) { 579 // we'll be freed later when the 2MSL timer expires 580 gSocketModule->acquire_socket(socket); 581 } 582 } 583 584 585 /*! Creates and sends a synchronize packet to /a address, and then waits 586 until the connection has been established or refused. 
587 */ 588 status_t 589 TCPEndpoint::Connect(const sockaddr* address) 590 { 591 if (!AddressModule()->is_same_family(address)) 592 return EAFNOSUPPORT; 593 594 MutexLocker locker(fLock); 595 596 TRACE("Connect() on address %s", PrintAddress(address)); 597 T(APICall(this, "connect")); 598 599 if (gStackModule->is_restarted_syscall()) { 600 bigtime_t timeout = gStackModule->restore_syscall_restart_timeout(); 601 status_t status = _WaitForEstablished(locker, timeout); 602 TRACE(" Connect(): Connection complete: %s (timeout was %" 603 B_PRIdBIGTIME ")", strerror(status), timeout); 604 return posix_error(status); 605 } 606 607 // Can only call connect() from CLOSED or LISTEN states 608 // otherwise endpoint is considered already connected 609 if (fState == LISTEN) { 610 // this socket is about to connect; remove pending connections in the backlog 611 gSocketModule->set_max_backlog(socket, 0); 612 } else if (fState == ESTABLISHED) { 613 return EISCONN; 614 } else if (fState != CLOSED) 615 return EINPROGRESS; 616 617 // consider destination address INADDR_ANY as INADDR_LOOPBACK 618 sockaddr_storage _address; 619 if (AddressModule()->is_empty_address(address, false)) { 620 AddressModule()->get_loopback_address((sockaddr *)&_address); 621 // for IPv4 and IPv6 the port is at the same offset 622 ((sockaddr_in &)_address).sin_port = ((sockaddr_in *)address)->sin_port; 623 address = (sockaddr *)&_address; 624 } 625 626 status_t status = _PrepareSendPath(address); 627 if (status < B_OK) 628 return status; 629 630 TRACE(" Connect(): starting 3-way handshake..."); 631 632 fState = SYNCHRONIZE_SENT; 633 T(State(this)); 634 635 // send SYN 636 status = _SendQueued(); 637 if (status != B_OK) { 638 _Close(); 639 return status; 640 } 641 642 // If we are running over Loopback, after _SendQueued() returns we 643 // may be in ESTABLISHED already. 
644 if (fState == ESTABLISHED) { 645 TRACE(" Connect() completed after _SendQueued()"); 646 return B_OK; 647 } 648 649 // wait until 3-way handshake is complete (if needed) 650 bigtime_t timeout = min_c(socket->send.timeout, TCP_CONNECTION_TIMEOUT); 651 if (timeout == 0) { 652 // we're a non-blocking socket 653 TRACE(" Connect() delayed, return EINPROGRESS"); 654 return EINPROGRESS; 655 } 656 657 bigtime_t absoluteTimeout = absolute_timeout(timeout); 658 gStackModule->store_syscall_restart_timeout(absoluteTimeout); 659 660 status = _WaitForEstablished(locker, absoluteTimeout); 661 TRACE(" Connect(): Connection complete: %s (timeout was %" B_PRIdBIGTIME 662 ")", strerror(status), timeout); 663 return posix_error(status); 664 } 665 666 667 status_t 668 TCPEndpoint::Accept(struct net_socket** _acceptedSocket) 669 { 670 MutexLocker locker(fLock); 671 672 TRACE("Accept()"); 673 T(APICall(this, "accept")); 674 675 status_t status; 676 bigtime_t timeout = absolute_timeout(socket->receive.timeout); 677 if (gStackModule->is_restarted_syscall()) 678 timeout = gStackModule->restore_syscall_restart_timeout(); 679 else 680 gStackModule->store_syscall_restart_timeout(timeout); 681 682 do { 683 locker.Unlock(); 684 685 status = acquire_sem_etc(fAcceptSemaphore, 1, B_ABSOLUTE_TIMEOUT 686 | B_CAN_INTERRUPT, timeout); 687 if (status != B_OK) { 688 if (status == B_TIMED_OUT && socket->receive.timeout == 0) 689 return B_WOULD_BLOCK; 690 691 return status; 692 } 693 694 locker.Lock(); 695 status = gSocketModule->dequeue_connected(socket, _acceptedSocket); 696 #ifdef TRACE_TCP 697 if (status == B_OK) 698 TRACE(" Accept() returning %p", (*_acceptedSocket)->first_protocol); 699 #endif 700 } while (status != B_OK); 701 702 return status; 703 } 704 705 706 status_t 707 TCPEndpoint::Bind(const sockaddr *address) 708 { 709 if (address == NULL) 710 return B_BAD_VALUE; 711 712 MutexLocker lock(fLock); 713 714 TRACE("Bind() on address %s", PrintAddress(address)); 715 T(APICall(this, "bind")); 
716 717 if (fState != CLOSED) 718 return EISCONN; 719 720 return fManager->Bind(this, address); 721 } 722 723 724 status_t 725 TCPEndpoint::Unbind(struct sockaddr *address) 726 { 727 MutexLocker _(fLock); 728 729 TRACE("Unbind()"); 730 T(APICall(this, "unbind")); 731 732 return fManager->Unbind(this); 733 } 734 735 736 status_t 737 TCPEndpoint::Listen(int count) 738 { 739 MutexLocker _(fLock); 740 741 TRACE("Listen()"); 742 T(APICall(this, "listen")); 743 744 if (fState != CLOSED && fState != LISTEN) 745 return B_BAD_VALUE; 746 747 if (fState == CLOSED) { 748 fAcceptSemaphore = create_sem(0, "tcp accept"); 749 if (fAcceptSemaphore < B_OK) 750 return ENOBUFS; 751 752 status_t status = fManager->SetPassive(this); 753 if (status != B_OK) { 754 delete_sem(fAcceptSemaphore); 755 fAcceptSemaphore = -1; 756 return status; 757 } 758 } 759 760 gSocketModule->set_max_backlog(socket, count); 761 762 fState = LISTEN; 763 T(State(this)); 764 return B_OK; 765 } 766 767 768 status_t 769 TCPEndpoint::Shutdown(int direction) 770 { 771 MutexLocker lock(fLock); 772 773 TRACE("Shutdown(%i)", direction); 774 T(APICall(this, "shutdown")); 775 776 if (direction == SHUT_RD || direction == SHUT_RDWR) 777 fFlags |= FLAG_NO_RECEIVE; 778 779 if (direction == SHUT_WR || direction == SHUT_RDWR) { 780 // TODO: That's not correct. After read/write shutting down the socket 781 // one should still be able to read previously arrived data. 782 _Disconnect(false); 783 } 784 785 return B_OK; 786 } 787 788 789 /*! 
Puts data contained in \a buffer into send buffer */
status_t
TCPEndpoint::SendData(net_buffer *buffer)
{
	MutexLocker lock(fLock);

	TRACE("SendData(buffer %p, size %" B_PRIu32 ", flags %#" B_PRIx32
		") [total %" B_PRIuSIZE " bytes, has %" B_PRIuSIZE "]", buffer,
		buffer->size, buffer->flags, fSendQueue.Size(), fSendQueue.Free());
	T(APICall(this, "senddata"));

	// remember the flags now; the buffer may have been handed over to the
	// send queue by the time they are evaluated
	uint32 flags = buffer->flags;

	if (fState == CLOSED)
		return ENOTCONN;
	if (fState == LISTEN)
		return EDESTADDRREQ;
	if (!is_writable(fState) && !is_establishing(fState)) {
		// we only send signals when called from userland
		if (gStackModule->is_syscall() && (flags & MSG_NOSIGNAL) == 0)
			send_signal(find_thread(NULL), SIGPIPE);
		return EPIPE;
	}

	size_t left = buffer->size;

	bigtime_t timeout = absolute_timeout(socket->send.timeout);
	if (gStackModule->is_restarted_syscall())
		timeout = gStackModule->restore_syscall_restart_timeout();
	else
		gStackModule->store_syscall_restart_timeout(timeout);

	while (left > 0) {
		while (fSendQueue.Free() < socket->send.low_water_mark) {
			// wait until enough space is available
			status_t status = _WaitForCondition(fSendCondition, lock, timeout);
			if (status < B_OK) {
				TRACE(" SendData() returning %s (%d)",
					strerror(posix_error(status)), (int)posix_error(status));
				return posix_error(status);
			}

			if (!is_writable(fState) && !is_establishing(fState)) {
				// we only send signals when called from userland, and
				// MSG_NOSIGNAL has to be honored here just like above
				if (gStackModule->is_syscall() && (flags & MSG_NOSIGNAL) == 0)
					send_signal(find_thread(NULL), SIGPIPE);
				return EPIPE;
			}
		}

		size_t size = fSendQueue.Free();
		if (size < left) {
			// we need to split the original buffer
			net_buffer* clone = gBufferModule->clone(buffer, false);
				// TODO: add offset/size parameter to net_buffer::clone() or
				// even a move_data() function, as this is a bit inefficient
			if (clone == NULL)
				return
ENOBUFS; 847 848 status_t status = gBufferModule->trim(clone, size); 849 if (status != B_OK) { 850 gBufferModule->free(clone); 851 return status; 852 } 853 854 gBufferModule->remove_header(buffer, size); 855 left -= size; 856 fSendQueue.Add(clone); 857 } else { 858 left -= buffer->size; 859 fSendQueue.Add(buffer); 860 } 861 } 862 863 TRACE(" SendData(): %" B_PRIuSIZE " bytes used.", fSendQueue.Used()); 864 865 bool force = false; 866 if ((flags & MSG_OOB) != 0) { 867 fSendUrgentOffset = fSendQueue.LastSequence(); 868 // RFC 961 specifies that the urgent offset points to the last 869 // byte of urgent data. However, this is commonly implemented as 870 // here, ie. it points to the first byte after the urgent data. 871 force = true; 872 } 873 if ((flags & MSG_EOF) != 0) 874 _Disconnect(false); 875 876 if (fState == ESTABLISHED || fState == FINISH_RECEIVED) 877 _SendQueued(force); 878 879 return B_OK; 880 } 881 882 883 ssize_t 884 TCPEndpoint::SendAvailable() 885 { 886 MutexLocker locker(fLock); 887 888 ssize_t available; 889 890 if (is_writable(fState)) 891 available = fSendQueue.Free(); 892 else if (is_establishing(fState)) 893 available = 0; 894 else 895 available = EPIPE; 896 897 TRACE("SendAvailable(): %" B_PRIdSSIZE, available); 898 T(APICall(this, "sendavailable")); 899 return available; 900 } 901 902 903 status_t 904 TCPEndpoint::FillStat(net_stat *stat) 905 { 906 MutexLocker _(fLock); 907 908 strlcpy(stat->state, name_for_state(fState), sizeof(stat->state)); 909 stat->receive_queue_size = fReceiveQueue.Available(); 910 stat->send_queue_size = fSendQueue.Used(); 911 912 return B_OK; 913 } 914 915 916 status_t 917 TCPEndpoint::ReadData(size_t numBytes, uint32 flags, net_buffer** _buffer) 918 { 919 MutexLocker locker(fLock); 920 921 TRACE("ReadData(%" B_PRIuSIZE " bytes, flags %#" B_PRIx32 ")", numBytes, 922 flags); 923 T(APICall(this, "readdata")); 924 925 *_buffer = NULL; 926 927 if (fState == CLOSED) 928 return ENOTCONN; 929 930 bigtime_t timeout = 
absolute_timeout(socket->receive.timeout); 931 if (gStackModule->is_restarted_syscall()) 932 timeout = gStackModule->restore_syscall_restart_timeout(); 933 else 934 gStackModule->store_syscall_restart_timeout(timeout); 935 936 if (fState == SYNCHRONIZE_SENT || fState == SYNCHRONIZE_RECEIVED) { 937 if (flags & MSG_DONTWAIT) 938 return B_WOULD_BLOCK; 939 940 status_t status = _WaitForEstablished(locker, timeout); 941 if (status < B_OK) 942 return posix_error(status); 943 } 944 945 size_t dataNeeded = socket->receive.low_water_mark; 946 947 // When MSG_WAITALL is set then the function should block 948 // until the full amount of data can be returned. 949 if (flags & MSG_WAITALL) 950 dataNeeded = numBytes; 951 952 // TODO: add support for urgent data (MSG_OOB) 953 954 while (true) { 955 if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE 956 || fState == TIME_WAIT) { 957 // ``Connection closing''. 958 return B_OK; 959 } 960 961 if (fReceiveQueue.Available() > 0) { 962 if (fReceiveQueue.Available() >= dataNeeded 963 || (fReceiveQueue.PushedData() > 0 964 && fReceiveQueue.PushedData() >= fReceiveQueue.Available())) 965 break; 966 } else if (fState == FINISH_RECEIVED) { 967 // ``If no text is awaiting delivery, the RECEIVE will 968 // get a Connection closing''. 969 return B_OK; 970 } 971 972 if ((flags & MSG_DONTWAIT) != 0 || socket->receive.timeout == 0) 973 return B_WOULD_BLOCK; 974 975 if ((fFlags & FLAG_NO_RECEIVE) != 0) 976 return B_OK; 977 978 status_t status = _WaitForCondition(fReceiveCondition, locker, timeout); 979 if (status < B_OK) { 980 // The Open Group base specification mentions that EINTR should be 981 // returned if the recv() is interrupted before _any data_ is 982 // available. So we actually check if there is data, and if so, 983 // push it to the user. 
984 if ((status == B_TIMED_OUT || status == B_INTERRUPTED) 985 && fReceiveQueue.Available() > 0) 986 break; 987 988 return posix_error(status); 989 } 990 } 991 992 TRACE(" ReadData(): %" B_PRIuSIZE " are available.", 993 fReceiveQueue.Available()); 994 995 if (numBytes < fReceiveQueue.Available()) 996 fReceiveCondition.NotifyAll(); 997 998 bool clone = (flags & MSG_PEEK) != 0; 999 1000 ssize_t receivedBytes = fReceiveQueue.Get(numBytes, !clone, _buffer); 1001 1002 TRACE(" ReadData(): %" B_PRIuSIZE " bytes kept.", 1003 fReceiveQueue.Available()); 1004 1005 // if we are opening the window, check if we should send an ACK 1006 if (!clone) 1007 SendAcknowledge(false); 1008 1009 return receivedBytes; 1010 } 1011 1012 1013 ssize_t 1014 TCPEndpoint::ReadAvailable() 1015 { 1016 MutexLocker locker(fLock); 1017 1018 TRACE("ReadAvailable(): %" B_PRIdSSIZE, _AvailableData()); 1019 T(APICall(this, "readavailable")); 1020 1021 return _AvailableData(); 1022 } 1023 1024 1025 status_t 1026 TCPEndpoint::SetSendBufferSize(size_t length) 1027 { 1028 MutexLocker _(fLock); 1029 fSendQueue.SetMaxBytes(length); 1030 return B_OK; 1031 } 1032 1033 1034 status_t 1035 TCPEndpoint::SetReceiveBufferSize(size_t length) 1036 { 1037 MutexLocker _(fLock); 1038 fReceiveQueue.SetMaxBytes(length); 1039 return B_OK; 1040 } 1041 1042 1043 status_t 1044 TCPEndpoint::GetOption(int option, void* _value, int* _length) 1045 { 1046 if (*_length != sizeof(int)) 1047 return B_BAD_VALUE; 1048 1049 int* value = (int*)_value; 1050 1051 switch (option) { 1052 case TCP_NODELAY: 1053 if ((fOptions & TCP_NODELAY) != 0) 1054 *value = 1; 1055 else 1056 *value = 0; 1057 return B_OK; 1058 1059 case TCP_MAXSEG: 1060 *value = fReceiveMaxSegmentSize; 1061 return B_OK; 1062 1063 default: 1064 return B_BAD_VALUE; 1065 } 1066 } 1067 1068 1069 status_t 1070 TCPEndpoint::SetOption(int option, const void* _value, int length) 1071 { 1072 if (option != TCP_NODELAY) 1073 return B_BAD_VALUE; 1074 1075 if (length != sizeof(int)) 1076 
return B_BAD_VALUE; 1077 1078 const int* value = (const int*)_value; 1079 1080 MutexLocker _(fLock); 1081 if (*value) 1082 fOptions |= TCP_NODELAY; 1083 else 1084 fOptions &= ~TCP_NODELAY; 1085 1086 return B_OK; 1087 } 1088 1089 1090 // #pragma mark - misc 1091 1092 1093 bool 1094 TCPEndpoint::IsBound() const 1095 { 1096 return !LocalAddress().IsEmpty(true); 1097 } 1098 1099 1100 bool 1101 TCPEndpoint::IsLocal() const 1102 { 1103 return (fFlags & FLAG_LOCAL) != 0; 1104 } 1105 1106 1107 status_t 1108 TCPEndpoint::DelayedAcknowledge() 1109 { 1110 if (gStackModule->cancel_timer(&fDelayedAcknowledgeTimer)) { 1111 // timer was active, send an ACK now (with the exception above, 1112 // we send every other ACK) 1113 T(TimerSet(this, "delayed ack", -1)); 1114 return SendAcknowledge(true); 1115 } 1116 1117 gStackModule->set_timer(&fDelayedAcknowledgeTimer, 1118 TCP_DELAYED_ACKNOWLEDGE_TIMEOUT); 1119 T(TimerSet(this, "delayed ack", TCP_DELAYED_ACKNOWLEDGE_TIMEOUT)); 1120 return B_OK; 1121 } 1122 1123 1124 status_t 1125 TCPEndpoint::SendAcknowledge(bool force) 1126 { 1127 return _SendQueued(force, 0); 1128 } 1129 1130 1131 void 1132 TCPEndpoint::_StartPersistTimer() 1133 { 1134 gStackModule->set_timer(&fPersistTimer, TCP_PERSIST_TIMEOUT); 1135 T(TimerSet(this, "persist", TCP_PERSIST_TIMEOUT)); 1136 } 1137 1138 1139 void 1140 TCPEndpoint::_EnterTimeWait() 1141 { 1142 TRACE("_EnterTimeWait()"); 1143 1144 if (fState == TIME_WAIT) { 1145 _CancelConnectionTimers(); 1146 1147 if (IsLocal()) { 1148 // we do not use TIME_WAIT state for local connections 1149 fFlags |= FLAG_DELETE_ON_CLOSE; 1150 return; 1151 } 1152 } 1153 1154 _UpdateTimeWait(); 1155 } 1156 1157 1158 void 1159 TCPEndpoint::_UpdateTimeWait() 1160 { 1161 gStackModule->set_timer(&fTimeWaitTimer, TCP_MAX_SEGMENT_LIFETIME << 1); 1162 T(TimerSet(this, "time-wait", TCP_MAX_SEGMENT_LIFETIME << 1)); 1163 } 1164 1165 1166 void 1167 TCPEndpoint::_CancelConnectionTimers() 1168 { 1169 gStackModule->cancel_timer(&fRetransmitTimer); 
1170 T(TimerSet(this, "retransmit", -1)); 1171 gStackModule->cancel_timer(&fPersistTimer); 1172 T(TimerSet(this, "persist", -1)); 1173 gStackModule->cancel_timer(&fDelayedAcknowledgeTimer); 1174 T(TimerSet(this, "delayed ack", -1)); 1175 } 1176 1177 1178 /*! Sends the FIN flag to the peer when the connection is still open. 1179 Moves the endpoint to the next state depending on where it was. 1180 */ 1181 status_t 1182 TCPEndpoint::_Disconnect(bool closing) 1183 { 1184 tcp_state previousState = fState; 1185 1186 if (fState == SYNCHRONIZE_RECEIVED || fState == ESTABLISHED) 1187 fState = FINISH_SENT; 1188 else if (fState == FINISH_RECEIVED) 1189 fState = WAIT_FOR_FINISH_ACKNOWLEDGE; 1190 else 1191 return B_OK; 1192 1193 T(State(this)); 1194 1195 status_t status = _SendQueued(); 1196 if (status != B_OK) { 1197 fState = previousState; 1198 T(State(this)); 1199 return status; 1200 } 1201 1202 return B_OK; 1203 } 1204 1205 1206 void 1207 TCPEndpoint::_MarkEstablished() 1208 { 1209 fState = ESTABLISHED; 1210 T(State(this)); 1211 1212 if (gSocketModule->has_parent(socket)) { 1213 gSocketModule->set_connected(socket); 1214 release_sem_etc(fAcceptSemaphore, 1, B_DO_NOT_RESCHEDULE); 1215 } 1216 1217 fSendCondition.NotifyAll(); 1218 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free()); 1219 } 1220 1221 1222 status_t 1223 TCPEndpoint::_WaitForEstablished(MutexLocker &locker, bigtime_t timeout) 1224 { 1225 // TODO: Checking for CLOSED seems correct, but breaks several neon tests. 1226 // When investigating this, also have a look at _Close() and _HandleReset(). 
1227 while (fState < ESTABLISHED/* && fState != CLOSED*/) { 1228 if (socket->error != B_OK) 1229 return socket->error; 1230 1231 status_t status = _WaitForCondition(fSendCondition, locker, timeout); 1232 if (status < B_OK) 1233 return status; 1234 } 1235 1236 return B_OK; 1237 } 1238 1239 1240 // #pragma mark - receive 1241 1242 1243 void 1244 TCPEndpoint::_Close() 1245 { 1246 _CancelConnectionTimers(); 1247 fState = CLOSED; 1248 T(State(this)); 1249 1250 fFlags |= FLAG_DELETE_ON_CLOSE; 1251 1252 fSendCondition.NotifyAll(); 1253 _NotifyReader(); 1254 1255 if (gSocketModule->has_parent(socket)) { 1256 // We still have a parent - obviously, we haven't been accepted yet, 1257 // so no one could ever close us. 1258 _CancelConnectionTimers(); 1259 gSocketModule->set_aborted(socket); 1260 } 1261 } 1262 1263 1264 void 1265 TCPEndpoint::_HandleReset(status_t error) 1266 { 1267 socket->error = error; 1268 _Close(); 1269 1270 gSocketModule->notify(socket, B_SELECT_WRITE, error); 1271 gSocketModule->notify(socket, B_SELECT_ERROR, error); 1272 } 1273 1274 1275 void 1276 TCPEndpoint::_DuplicateAcknowledge(tcp_segment_header &segment) 1277 { 1278 if (++fDuplicateAcknowledgeCount < 3) 1279 return; 1280 1281 if (fDuplicateAcknowledgeCount == 3) { 1282 _ResetSlowStart(); 1283 fCongestionWindow = fSlowStartThreshold + 3 * fSendMaxSegmentSize; 1284 fSendNext = segment.acknowledge; 1285 } else if (fDuplicateAcknowledgeCount > 3) 1286 fCongestionWindow += fSendMaxSegmentSize; 1287 1288 _SendQueued(); 1289 } 1290 1291 1292 void 1293 TCPEndpoint::_UpdateTimestamps(tcp_segment_header& segment, 1294 size_t segmentLength) 1295 { 1296 if (fFlags & FLAG_OPTION_TIMESTAMP) { 1297 tcp_sequence sequence(segment.sequence); 1298 1299 if (fLastAcknowledgeSent >= sequence 1300 && fLastAcknowledgeSent < (sequence + segmentLength)) 1301 fReceivedTimestamp = segment.timestamp_value; 1302 } 1303 } 1304 1305 1306 ssize_t 1307 TCPEndpoint::_AvailableData() const 1308 { 1309 // TODO: Refer to the 
FLAG_NO_RECEIVE comment above regarding 1310 // the application of FLAG_NO_RECEIVE in listen()ing 1311 // sockets. 1312 if (fState == LISTEN) 1313 return gSocketModule->count_connected(socket); 1314 if (fState == SYNCHRONIZE_SENT) 1315 return 0; 1316 1317 ssize_t availableData = fReceiveQueue.Available(); 1318 1319 if (availableData == 0 && !_ShouldReceive()) 1320 return ENOTCONN; 1321 1322 return availableData; 1323 } 1324 1325 1326 void 1327 TCPEndpoint::_NotifyReader() 1328 { 1329 fReceiveCondition.NotifyAll(); 1330 gSocketModule->notify(socket, B_SELECT_READ, _AvailableData()); 1331 } 1332 1333 1334 bool 1335 TCPEndpoint::_AddData(tcp_segment_header& segment, net_buffer* buffer) 1336 { 1337 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1338 // Remember the position of the finish received flag 1339 fFinishReceived = true; 1340 fFinishReceivedAt = segment.sequence + buffer->size; 1341 } 1342 1343 fReceiveQueue.Add(buffer, segment.sequence); 1344 fReceiveNext = fReceiveQueue.NextSequence(); 1345 1346 if (fFinishReceived) { 1347 // Set or reset the finish flag on the current segment 1348 if (fReceiveNext < fFinishReceivedAt) 1349 segment.flags &= ~TCP_FLAG_FINISH; 1350 else 1351 segment.flags |= TCP_FLAG_FINISH; 1352 } 1353 1354 TRACE(" _AddData(): adding data, receive next = %" B_PRIu32 ". 
Now have %" 1355 B_PRIuSIZE " bytes.", fReceiveNext.Number(), fReceiveQueue.Available()); 1356 1357 if ((segment.flags & TCP_FLAG_PUSH) != 0) 1358 fReceiveQueue.SetPushPointer(); 1359 1360 return fReceiveQueue.Available() > 0; 1361 } 1362 1363 1364 void 1365 TCPEndpoint::_PrepareReceivePath(tcp_segment_header& segment) 1366 { 1367 fInitialReceiveSequence = segment.sequence; 1368 fFinishReceived = false; 1369 1370 // count the received SYN 1371 segment.sequence++; 1372 1373 fReceiveNext = segment.sequence; 1374 fReceiveQueue.SetInitialSequence(segment.sequence); 1375 1376 if ((fOptions & TCP_NOOPT) == 0) { 1377 if (segment.max_segment_size > 0) 1378 fSendMaxSegmentSize = segment.max_segment_size; 1379 1380 if (segment.options & TCP_HAS_WINDOW_SCALE) { 1381 fFlags |= FLAG_OPTION_WINDOW_SCALE; 1382 fSendWindowShift = segment.window_shift; 1383 } else { 1384 fFlags &= ~FLAG_OPTION_WINDOW_SCALE; 1385 fReceiveWindowShift = 0; 1386 } 1387 1388 if (segment.options & TCP_HAS_TIMESTAMPS) { 1389 fFlags |= FLAG_OPTION_TIMESTAMP; 1390 fReceivedTimestamp = segment.timestamp_value; 1391 } else 1392 fFlags &= ~FLAG_OPTION_TIMESTAMP; 1393 } 1394 1395 fCongestionWindow = 2 * fSendMaxSegmentSize; 1396 fSlowStartThreshold = (uint32)segment.advertised_window << fSendWindowShift; 1397 } 1398 1399 1400 bool 1401 TCPEndpoint::_ShouldReceive() const 1402 { 1403 if ((fFlags & FLAG_NO_RECEIVE) != 0) 1404 return false; 1405 1406 return fState == ESTABLISHED || fState == FINISH_SENT 1407 || fState == FINISH_ACKNOWLEDGED; 1408 } 1409 1410 1411 int32 1412 TCPEndpoint::_Spawn(TCPEndpoint* parent, tcp_segment_header& segment, 1413 net_buffer* buffer) 1414 { 1415 MutexLocker _(fLock); 1416 1417 // TODO error checking 1418 ProtocolSocket::Open(); 1419 1420 fState = SYNCHRONIZE_RECEIVED; 1421 T(Spawn(parent, this)); 1422 1423 fManager = parent->fManager; 1424 1425 LocalAddress().SetTo(buffer->destination); 1426 PeerAddress().SetTo(buffer->source); 1427 1428 TRACE("Spawn()"); 1429 1430 // TODO: proper 
error handling! 1431 if (fManager->BindChild(this) != B_OK) { 1432 T(Error(this, "binding failed", __LINE__)); 1433 return DROP; 1434 } 1435 if (_PrepareSendPath(*PeerAddress()) != B_OK) { 1436 T(Error(this, "prepare send faild", __LINE__)); 1437 return DROP; 1438 } 1439 1440 fOptions = parent->fOptions; 1441 fAcceptSemaphore = parent->fAcceptSemaphore; 1442 1443 _PrepareReceivePath(segment); 1444 1445 // send SYN+ACK 1446 if (_SendQueued() != B_OK) { 1447 T(Error(this, "sending failed", __LINE__)); 1448 return DROP; 1449 } 1450 1451 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 1452 // we handled this flag now, it must not be set for further processing 1453 1454 return _Receive(segment, buffer); 1455 } 1456 1457 1458 int32 1459 TCPEndpoint::_ListenReceive(tcp_segment_header& segment, net_buffer* buffer) 1460 { 1461 TRACE("ListenReceive()"); 1462 1463 // Essentially, we accept only TCP_FLAG_SYNCHRONIZE in this state, 1464 // but the error behaviour differs 1465 if (segment.flags & TCP_FLAG_RESET) 1466 return DROP; 1467 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 1468 return DROP | RESET; 1469 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 1470 return DROP; 1471 1472 // TODO: drop broadcast/multicast 1473 1474 // spawn new endpoint for accept() 1475 net_socket* newSocket; 1476 if (gSocketModule->spawn_pending_socket(socket, &newSocket) < B_OK) { 1477 T(Error(this, "spawning failed", __LINE__)); 1478 return DROP; 1479 } 1480 1481 return ((TCPEndpoint *)newSocket->first_protocol)->_Spawn(this, 1482 segment, buffer); 1483 } 1484 1485 1486 int32 1487 TCPEndpoint::_SynchronizeSentReceive(tcp_segment_header &segment, 1488 net_buffer *buffer) 1489 { 1490 TRACE("_SynchronizeSentReceive()"); 1491 1492 if ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 1493 && (fInitialSendSequence >= segment.acknowledge 1494 || fSendMax < segment.acknowledge)) 1495 return DROP | RESET; 1496 1497 if (segment.flags & TCP_FLAG_RESET) { 1498 _HandleReset(ECONNREFUSED); 1499 return DROP; 1500 } 1501 1502 
if ((segment.flags & TCP_FLAG_SYNCHRONIZE) == 0) 1503 return DROP; 1504 1505 fSendUnacknowledged = segment.acknowledge; 1506 _PrepareReceivePath(segment); 1507 1508 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) { 1509 _MarkEstablished(); 1510 } else { 1511 // simultaneous open 1512 fState = SYNCHRONIZE_RECEIVED; 1513 T(State(this)); 1514 } 1515 1516 segment.flags &= ~TCP_FLAG_SYNCHRONIZE; 1517 // we handled this flag now, it must not be set for further processing 1518 1519 return _Receive(segment, buffer) | IMMEDIATE_ACKNOWLEDGE; 1520 } 1521 1522 1523 int32 1524 TCPEndpoint::_Receive(tcp_segment_header& segment, net_buffer* buffer) 1525 { 1526 uint32 advertisedWindow = (uint32)segment.advertised_window 1527 << fSendWindowShift; 1528 size_t segmentLength = buffer->size; 1529 1530 // First, handle the most common case for uni-directional data transfer 1531 // (known as header prediction - the segment must not change the window, 1532 // and must be the expected sequence, and contain no control flags) 1533 1534 if (fState == ESTABLISHED 1535 && segment.AcknowledgeOnly() 1536 && fReceiveNext == segment.sequence 1537 && advertisedWindow > 0 && advertisedWindow == fSendWindow 1538 && fSendNext == fSendMax) { 1539 _UpdateTimestamps(segment, segmentLength); 1540 1541 if (segmentLength == 0) { 1542 // this is a pure acknowledge segment - we're on the sending end 1543 if (fSendUnacknowledged < segment.acknowledge 1544 && fSendMax >= segment.acknowledge) { 1545 _Acknowledged(segment); 1546 return DROP; 1547 } 1548 } else if (segment.acknowledge == fSendUnacknowledged 1549 && fReceiveQueue.IsContiguous() 1550 && fReceiveQueue.Free() >= segmentLength 1551 && (fFlags & FLAG_NO_RECEIVE) == 0) { 1552 if (_AddData(segment, buffer)) 1553 _NotifyReader(); 1554 1555 return KEEP | ((segment.flags & TCP_FLAG_PUSH) != 0 1556 ? 
IMMEDIATE_ACKNOWLEDGE : ACKNOWLEDGE); 1557 } 1558 } 1559 1560 // The fast path was not applicable, so we continue with the standard 1561 // processing of the incoming segment 1562 1563 ASSERT(fState != SYNCHRONIZE_SENT && fState != LISTEN); 1564 1565 if (fState != CLOSED && fState != TIME_WAIT) { 1566 // Check sequence number 1567 if (!segment_in_sequence(segment, segmentLength, fReceiveNext, 1568 fReceiveWindow)) { 1569 TRACE(" Receive(): segment out of window, next: %" B_PRIu32 1570 " wnd: %" B_PRIu32, fReceiveNext.Number(), fReceiveWindow); 1571 if ((segment.flags & TCP_FLAG_RESET) != 0) { 1572 // TODO: this doesn't look right - review! 1573 return DROP; 1574 } 1575 return DROP | IMMEDIATE_ACKNOWLEDGE; 1576 } 1577 } 1578 1579 if ((segment.flags & TCP_FLAG_RESET) != 0) { 1580 // Is this a valid reset? 1581 // We generally ignore resets in time wait state (see RFC 1337) 1582 if (fLastAcknowledgeSent <= segment.sequence 1583 && tcp_sequence(segment.sequence) < (fLastAcknowledgeSent 1584 + fReceiveWindow) 1585 && fState != TIME_WAIT) { 1586 status_t error; 1587 if (fState == SYNCHRONIZE_RECEIVED) 1588 error = ECONNREFUSED; 1589 else if (fState == CLOSING || fState == WAIT_FOR_FINISH_ACKNOWLEDGE) 1590 error = ENOTCONN; 1591 else 1592 error = ECONNRESET; 1593 1594 _HandleReset(error); 1595 } 1596 1597 return DROP; 1598 } 1599 1600 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0 1601 || (fState == SYNCHRONIZE_RECEIVED 1602 && (fInitialReceiveSequence > segment.sequence 1603 || ((segment.flags & TCP_FLAG_ACKNOWLEDGE) != 0 1604 && (fSendUnacknowledged > segment.acknowledge 1605 || fSendMax < segment.acknowledge))))) { 1606 // reset the connection - either the initial SYN was faulty, or we 1607 // received a SYN within the data stream 1608 return DROP | RESET; 1609 } 1610 1611 // TODO: Check this! Why do we advertize a window outside of what we should 1612 // buffer? 
1613 fReceiveWindow = max_c(fReceiveQueue.Free(), fReceiveWindow); 1614 // the window must not shrink 1615 1616 // trim buffer to be within the receive window 1617 int32 drop = (int32)(fReceiveNext - segment.sequence).Number(); 1618 if (drop > 0) { 1619 if ((uint32)drop > buffer->size 1620 || ((uint32)drop == buffer->size 1621 && (segment.flags & TCP_FLAG_FINISH) == 0)) { 1622 // don't accidently remove a FIN we shouldn't remove 1623 segment.flags &= ~TCP_FLAG_FINISH; 1624 drop = buffer->size; 1625 } 1626 1627 // remove duplicate data at the start 1628 TRACE("* remove %" B_PRId32 " bytes from the start", drop); 1629 gBufferModule->remove_header(buffer, drop); 1630 segment.sequence += drop; 1631 } 1632 1633 int32 action = KEEP; 1634 1635 drop = (int32)(segment.sequence + buffer->size 1636 - (fReceiveNext + fReceiveWindow)).Number(); 1637 if (drop > 0) { 1638 // remove data exceeding our window 1639 if ((uint32)drop >= buffer->size) { 1640 // if we can accept data, or the segment is not what we'd expect, 1641 // drop the segment (an immediate acknowledge is always triggered) 1642 if (fReceiveWindow != 0 || segment.sequence != fReceiveNext) 1643 return DROP | IMMEDIATE_ACKNOWLEDGE; 1644 1645 action |= IMMEDIATE_ACKNOWLEDGE; 1646 } 1647 1648 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1649 // we need to remove the finish, too, as part of the data 1650 drop--; 1651 } 1652 1653 segment.flags &= ~(TCP_FLAG_FINISH | TCP_FLAG_PUSH); 1654 TRACE("* remove %" B_PRId32 " bytes from the end", drop); 1655 gBufferModule->remove_trailer(buffer, drop); 1656 } 1657 1658 #ifdef TRACE_TCP 1659 if (advertisedWindow > fSendWindow) { 1660 TRACE(" Receive(): Window update %" B_PRIu32 " -> %" B_PRIu32, 1661 fSendWindow, advertisedWindow); 1662 } 1663 #endif 1664 1665 fSendWindow = advertisedWindow; 1666 if (advertisedWindow > fSendMaxWindow) 1667 fSendMaxWindow = advertisedWindow; 1668 1669 // Then look at the acknowledgement for any updates 1670 1671 if ((segment.flags & 
TCP_FLAG_ACKNOWLEDGE) != 0) { 1672 // process acknowledged data 1673 if (fState == SYNCHRONIZE_RECEIVED) 1674 _MarkEstablished(); 1675 1676 if (fSendMax < segment.acknowledge) 1677 return DROP | IMMEDIATE_ACKNOWLEDGE; 1678 1679 if (segment.acknowledge < fSendUnacknowledged) { 1680 if (buffer->size == 0 && advertisedWindow == fSendWindow 1681 && (segment.flags & TCP_FLAG_FINISH) == 0) { 1682 TRACE("Receive(): duplicate ack!"); 1683 1684 _DuplicateAcknowledge(segment); 1685 } 1686 1687 return DROP; 1688 } else { 1689 // this segment acknowledges in flight data 1690 1691 if (fDuplicateAcknowledgeCount >= 3) { 1692 // deflate the window. 1693 fCongestionWindow = fSlowStartThreshold; 1694 } 1695 1696 fDuplicateAcknowledgeCount = 0; 1697 1698 if (fSendMax == segment.acknowledge) 1699 TRACE("Receive(): all inflight data ack'd!"); 1700 1701 if (segment.acknowledge > fSendQueue.LastSequence() 1702 && fState > ESTABLISHED) { 1703 TRACE("Receive(): FIN has been acknowledged!"); 1704 1705 switch (fState) { 1706 case FINISH_SENT: 1707 fState = FINISH_ACKNOWLEDGED; 1708 T(State(this)); 1709 break; 1710 case CLOSING: 1711 fState = TIME_WAIT; 1712 T(State(this)); 1713 _EnterTimeWait(); 1714 return DROP; 1715 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1716 _Close(); 1717 break; 1718 1719 default: 1720 break; 1721 } 1722 } 1723 1724 if (fState != CLOSED) 1725 _Acknowledged(segment); 1726 } 1727 } 1728 1729 if (segment.flags & TCP_FLAG_URGENT) { 1730 if (fState == ESTABLISHED || fState == FINISH_SENT 1731 || fState == FINISH_ACKNOWLEDGED) { 1732 // TODO: Handle urgent data: 1733 // - RCV.UP <- max(RCV.UP, SEG.UP) 1734 // - signal the user that urgent data is available (SIGURG) 1735 } 1736 } 1737 1738 bool notify = false; 1739 1740 // The buffer may be freed if its data is added to the queue, so cache 1741 // the size as we still need it later. 
1742 uint32 bufferSize = buffer->size; 1743 1744 if ((bufferSize > 0 || (segment.flags & TCP_FLAG_FINISH) != 0) 1745 && _ShouldReceive()) 1746 notify = _AddData(segment, buffer); 1747 else { 1748 if ((fFlags & FLAG_NO_RECEIVE) != 0) 1749 fReceiveNext += buffer->size; 1750 1751 action = (action & ~KEEP) | DROP; 1752 } 1753 1754 if ((segment.flags & TCP_FLAG_FINISH) != 0) { 1755 segmentLength++; 1756 if (fState != CLOSED && fState != LISTEN && fState != SYNCHRONIZE_SENT) { 1757 TRACE("Receive(): peer is finishing connection!"); 1758 fReceiveNext++; 1759 notify = true; 1760 1761 // FIN implies PUSH 1762 fReceiveQueue.SetPushPointer(); 1763 1764 // we'll reply immediately to the FIN if we are not 1765 // transitioning to TIME WAIT so we immediatly ACK it. 1766 action |= IMMEDIATE_ACKNOWLEDGE; 1767 1768 // other side is closing connection; change states 1769 switch (fState) { 1770 case ESTABLISHED: 1771 case SYNCHRONIZE_RECEIVED: 1772 fState = FINISH_RECEIVED; 1773 T(State(this)); 1774 break; 1775 case FINISH_SENT: 1776 // simultaneous close 1777 fState = CLOSING; 1778 T(State(this)); 1779 break; 1780 case FINISH_ACKNOWLEDGED: 1781 fState = TIME_WAIT; 1782 T(State(this)); 1783 _EnterTimeWait(); 1784 break; 1785 case TIME_WAIT: 1786 _UpdateTimeWait(); 1787 break; 1788 1789 default: 1790 break; 1791 } 1792 } 1793 } 1794 1795 if (notify) 1796 _NotifyReader(); 1797 1798 if (bufferSize > 0 || (segment.flags & TCP_FLAG_SYNCHRONIZE) != 0) 1799 action |= ACKNOWLEDGE; 1800 1801 _UpdateTimestamps(segment, segmentLength); 1802 1803 TRACE("Receive() Action %" B_PRId32, action); 1804 1805 return action; 1806 } 1807 1808 1809 int32 1810 TCPEndpoint::SegmentReceived(tcp_segment_header& segment, net_buffer* buffer) 1811 { 1812 MutexLocker locker(fLock); 1813 1814 TRACE("SegmentReceived(): buffer %p (%" B_PRIu32 " bytes) address %s " 1815 "to %s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32 1816 ", wnd %" B_PRIu32, buffer, buffer->size, PrintAddress(buffer->source), 1817 
PrintAddress(buffer->destination), segment.flags, segment.sequence, 1818 segment.acknowledge, 1819 (uint32)segment.advertised_window << fSendWindowShift); 1820 T(Receive(this, segment, 1821 (uint32)segment.advertised_window << fSendWindowShift, buffer)); 1822 int32 segmentAction = DROP; 1823 1824 switch (fState) { 1825 case LISTEN: 1826 segmentAction = _ListenReceive(segment, buffer); 1827 break; 1828 1829 case SYNCHRONIZE_SENT: 1830 segmentAction = _SynchronizeSentReceive(segment, buffer); 1831 break; 1832 1833 case SYNCHRONIZE_RECEIVED: 1834 case ESTABLISHED: 1835 case FINISH_RECEIVED: 1836 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1837 case FINISH_SENT: 1838 case FINISH_ACKNOWLEDGED: 1839 case CLOSING: 1840 case TIME_WAIT: 1841 case CLOSED: 1842 segmentAction = _Receive(segment, buffer); 1843 break; 1844 } 1845 1846 // process acknowledge action as asked for by the *Receive() method 1847 if (segmentAction & IMMEDIATE_ACKNOWLEDGE) 1848 SendAcknowledge(true); 1849 else if (segmentAction & ACKNOWLEDGE) 1850 DelayedAcknowledge(); 1851 1852 if ((fFlags & (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) 1853 == (FLAG_CLOSED | FLAG_DELETE_ON_CLOSE)) { 1854 locker.Unlock(); 1855 gSocketModule->release_socket(socket); 1856 } 1857 1858 return segmentAction; 1859 } 1860 1861 1862 // #pragma mark - send 1863 1864 1865 inline uint8 1866 TCPEndpoint::_CurrentFlags() 1867 { 1868 // we don't set FLAG_FINISH here, instead we do it 1869 // conditionally below depending if we are sending 1870 // the last bytes of the send queue. 
1871 1872 switch (fState) { 1873 case CLOSED: 1874 return TCP_FLAG_RESET | TCP_FLAG_ACKNOWLEDGE; 1875 1876 case SYNCHRONIZE_SENT: 1877 return TCP_FLAG_SYNCHRONIZE; 1878 case SYNCHRONIZE_RECEIVED: 1879 return TCP_FLAG_SYNCHRONIZE | TCP_FLAG_ACKNOWLEDGE; 1880 1881 case ESTABLISHED: 1882 case FINISH_RECEIVED: 1883 case FINISH_ACKNOWLEDGED: 1884 case TIME_WAIT: 1885 case WAIT_FOR_FINISH_ACKNOWLEDGE: 1886 case FINISH_SENT: 1887 case CLOSING: 1888 return TCP_FLAG_ACKNOWLEDGE; 1889 1890 default: 1891 return 0; 1892 } 1893 } 1894 1895 1896 inline bool 1897 TCPEndpoint::_ShouldSendSegment(tcp_segment_header& segment, uint32 length, 1898 uint32 segmentMaxSize, uint32 flightSize) 1899 { 1900 if (length > 0) { 1901 // Avoid the silly window syndrome - we only send a segment in case: 1902 // - we have a full segment to send, or 1903 // - we're at the end of our buffer queue, or 1904 // - the buffer is at least larger than half of the maximum send window, 1905 // or 1906 // - we're retransmitting data 1907 if (length == segmentMaxSize 1908 || (fOptions & TCP_NODELAY) != 0 1909 || tcp_sequence(fSendNext + length) == fSendQueue.LastSequence() 1910 || (fSendMaxWindow > 0 && length >= fSendMaxWindow / 2)) 1911 return true; 1912 } 1913 1914 // check if we need to send a window update to the peer 1915 if (segment.advertised_window > 0) { 1916 // correct the window to take into account what already has been advertised 1917 uint32 window = (segment.advertised_window << fReceiveWindowShift) 1918 - (fReceiveMaxAdvertised - fReceiveNext).Number(); 1919 1920 // if we can advertise a window larger than twice the maximum segment 1921 // size, or half the maximum buffer size we send a window update 1922 if (window >= (fReceiveMaxSegmentSize << 1) 1923 || window >= (socket->receive.buffer_size >> 1)) 1924 return true; 1925 } 1926 1927 if ((segment.flags & (TCP_FLAG_SYNCHRONIZE | TCP_FLAG_FINISH 1928 | TCP_FLAG_RESET)) != 0) 1929 return true; 1930 1931 // We do have urgent data pending 1932 if 
(fSendUrgentOffset > fSendNext) 1933 return true; 1934 1935 // there is no reason to send a segment just now 1936 return false; 1937 } 1938 1939 1940 status_t 1941 TCPEndpoint::_SendQueued(bool force) 1942 { 1943 return _SendQueued(force, fSendWindow); 1944 } 1945 1946 1947 /*! Sends one or more TCP segments with the data waiting in the queue, or some 1948 specific flags that need to be sent. 1949 */ 1950 status_t 1951 TCPEndpoint::_SendQueued(bool force, uint32 sendWindow) 1952 { 1953 if (fRoute == NULL) 1954 return B_ERROR; 1955 1956 // in passive state? 1957 if (fState == LISTEN) 1958 return B_ERROR; 1959 1960 tcp_segment_header segment(_CurrentFlags()); 1961 1962 if ((fOptions & TCP_NOOPT) == 0) { 1963 if ((fFlags & FLAG_OPTION_TIMESTAMP) != 0) { 1964 segment.options |= TCP_HAS_TIMESTAMPS; 1965 segment.timestamp_reply = fReceivedTimestamp; 1966 segment.timestamp_value = tcp_now(); 1967 } 1968 1969 if ((segment.flags & TCP_FLAG_SYNCHRONIZE) != 0 1970 && fSendNext == fInitialSendSequence) { 1971 // add connection establishment options 1972 segment.max_segment_size = fReceiveMaxSegmentSize; 1973 if (fFlags & FLAG_OPTION_WINDOW_SCALE) { 1974 segment.options |= TCP_HAS_WINDOW_SCALE; 1975 segment.window_shift = fReceiveWindowShift; 1976 } 1977 } 1978 } 1979 1980 size_t availableBytes = fReceiveQueue.Free(); 1981 if (fFlags & FLAG_OPTION_WINDOW_SCALE) 1982 segment.advertised_window = availableBytes >> fReceiveWindowShift; 1983 else 1984 segment.advertised_window = min_c(TCP_MAX_WINDOW, availableBytes); 1985 1986 segment.acknowledge = fReceiveNext.Number(); 1987 1988 // Process urgent data 1989 if (fSendUrgentOffset > fSendNext) { 1990 segment.flags |= TCP_FLAG_URGENT; 1991 segment.urgent_offset = (fSendUrgentOffset - fSendNext).Number(); 1992 } else { 1993 fSendUrgentOffset = fSendUnacknowledged.Number(); 1994 // Keep urgent offset updated, so that it doesn't reach into our 1995 // send window on overlap 1996 segment.urgent_offset = 0; 1997 } 1998 1999 if 
(fCongestionWindow > 0 && fCongestionWindow < sendWindow) 2000 sendWindow = fCongestionWindow; 2001 2002 // fSendUnacknowledged 2003 // | fSendNext fSendMax 2004 // | | | 2005 // v v v 2006 // ----------------------------------- 2007 // | effective window | 2008 // ----------------------------------- 2009 2010 // Flight size represents the window of data which is currently in the 2011 // ether. We should never send data such as the flight size becomes larger 2012 // than the effective window. Note however that the effective window may be 2013 // reduced (by congestion for instance), so at some point in time flight 2014 // size may be larger than the currently calculated window. 2015 2016 uint32 flightSize = (fSendMax - fSendUnacknowledged).Number(); 2017 uint32 consumedWindow = (fSendNext - fSendUnacknowledged).Number(); 2018 2019 if (consumedWindow > sendWindow) { 2020 sendWindow = 0; 2021 // TODO: enter persist state? try to get a window update. 2022 } else 2023 sendWindow -= consumedWindow; 2024 2025 if (force && sendWindow == 0 && fSendNext <= fSendQueue.LastSequence()) { 2026 // send one byte of data to ask for a window update 2027 // (triggered by the persist timer) 2028 sendWindow = 1; 2029 } 2030 2031 uint32 length = min_c(fSendQueue.Available(fSendNext), sendWindow); 2032 bool shouldStartRetransmitTimer = fSendNext == fSendUnacknowledged; 2033 bool retransmit = fSendNext < fSendMax; 2034 2035 do { 2036 uint32 segmentMaxSize = fSendMaxSegmentSize 2037 - tcp_options_length(segment); 2038 uint32 segmentLength = min_c(length, segmentMaxSize); 2039 2040 if (fSendNext + segmentLength == fSendQueue.LastSequence()) { 2041 if (state_needs_finish(fState)) 2042 segment.flags |= TCP_FLAG_FINISH; 2043 if (length > 0) 2044 segment.flags |= TCP_FLAG_PUSH; 2045 } 2046 2047 // Determine if we should really send this segment 2048 if (!force && !retransmit && !_ShouldSendSegment(segment, segmentLength, 2049 segmentMaxSize, flightSize)) { 2050 if (fSendQueue.Available() 2051 
&& !gStackModule->is_timer_active(&fPersistTimer) 2052 && !gStackModule->is_timer_active(&fRetransmitTimer)) 2053 _StartPersistTimer(); 2054 break; 2055 } 2056 2057 net_buffer *buffer = gBufferModule->create(256); 2058 if (buffer == NULL) 2059 return B_NO_MEMORY; 2060 2061 status_t status = B_OK; 2062 if (segmentLength > 0) 2063 status = fSendQueue.Get(buffer, fSendNext, segmentLength); 2064 if (status < B_OK) { 2065 gBufferModule->free(buffer); 2066 return status; 2067 } 2068 2069 LocalAddress().CopyTo(buffer->source); 2070 PeerAddress().CopyTo(buffer->destination); 2071 2072 uint32 size = buffer->size; 2073 segment.sequence = fSendNext.Number(); 2074 2075 TRACE("SendQueued(): buffer %p (%" B_PRIu32 " bytes) address %s to " 2076 "%s flags %#" B_PRIx8 ", seq %" B_PRIu32 ", ack %" B_PRIu32 2077 ", rwnd %" B_PRIu16 ", cwnd %" B_PRIu32 ", ssthresh %" B_PRIu32 2078 ", len %" B_PRIu32 ", first %" B_PRIu32 ", last %" B_PRIu32, 2079 buffer, buffer->size, PrintAddress(buffer->source), 2080 PrintAddress(buffer->destination), segment.flags, segment.sequence, 2081 segment.acknowledge, segment.advertised_window, 2082 fCongestionWindow, fSlowStartThreshold, segmentLength, 2083 fSendQueue.FirstSequence().Number(), 2084 fSendQueue.LastSequence().Number()); 2085 T(Send(this, segment, buffer, fSendQueue.FirstSequence(), 2086 fSendQueue.LastSequence())); 2087 2088 PROBE(buffer, sendWindow); 2089 sendWindow -= buffer->size; 2090 2091 status = add_tcp_header(AddressModule(), segment, buffer); 2092 if (status != B_OK) { 2093 gBufferModule->free(buffer); 2094 return status; 2095 } 2096 2097 // Update send status - we need to do this before we send the data 2098 // for local connections as the answer is directly handled 2099 2100 if (segment.flags & TCP_FLAG_SYNCHRONIZE) { 2101 segment.options &= ~TCP_HAS_WINDOW_SCALE; 2102 segment.max_segment_size = 0; 2103 size++; 2104 } 2105 2106 if (segment.flags & TCP_FLAG_FINISH) 2107 size++; 2108 2109 uint32 sendMax = fSendMax.Number(); 2110 
fSendNext += size; 2111 if (fSendMax < fSendNext) 2112 fSendMax = fSendNext; 2113 2114 fReceiveMaxAdvertised = fReceiveNext 2115 + ((uint32)segment.advertised_window << fReceiveWindowShift); 2116 2117 status = next->module->send_routed_data(next, fRoute, buffer); 2118 if (status < B_OK) { 2119 gBufferModule->free(buffer); 2120 2121 fSendNext = segment.sequence; 2122 fSendMax = sendMax; 2123 // restore send status 2124 return status; 2125 } 2126 2127 if (shouldStartRetransmitTimer && size > 0) { 2128 TRACE("starting initial retransmit timer of: %" B_PRIdBIGTIME, 2129 fRetransmitTimeout); 2130 gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout); 2131 T(TimerSet(this, "retransmit", fRetransmitTimeout)); 2132 shouldStartRetransmitTimer = false; 2133 } 2134 2135 if (segment.flags & TCP_FLAG_ACKNOWLEDGE) 2136 fLastAcknowledgeSent = segment.acknowledge; 2137 2138 length -= segmentLength; 2139 segment.flags &= ~(TCP_FLAG_SYNCHRONIZE | TCP_FLAG_RESET 2140 | TCP_FLAG_FINISH); 2141 2142 if (retransmit) 2143 break; 2144 2145 } while (length > 0); 2146 2147 return B_OK; 2148 } 2149 2150 2151 int 2152 TCPEndpoint::_MaxSegmentSize(const sockaddr* address) const 2153 { 2154 return next->module->get_mtu(next, address) - sizeof(tcp_header); 2155 } 2156 2157 2158 status_t 2159 TCPEndpoint::_PrepareSendPath(const sockaddr* peer) 2160 { 2161 if (fRoute == NULL) { 2162 fRoute = gDatalinkModule->get_route(Domain(), peer); 2163 if (fRoute == NULL) 2164 return ENETUNREACH; 2165 2166 if ((fRoute->flags & RTF_LOCAL) != 0) 2167 fFlags |= FLAG_LOCAL; 2168 } 2169 2170 // make sure connection does not already exist 2171 status_t status = fManager->SetConnection(this, *LocalAddress(), peer, 2172 fRoute->interface_address->local); 2173 if (status < B_OK) 2174 return status; 2175 2176 fInitialSendSequence = system_time() >> 4; 2177 fSendNext = fInitialSendSequence; 2178 fSendUnacknowledged = fInitialSendSequence; 2179 fSendMax = fInitialSendSequence; 2180 fSendUrgentOffset = 
fInitialSendSequence; 2181 2182 // we are counting the SYN here 2183 fSendQueue.SetInitialSequence(fSendNext + 1); 2184 2185 fReceiveMaxSegmentSize = _MaxSegmentSize(peer); 2186 2187 // Compute the window shift we advertise to our peer - if it doesn't support 2188 // this option, this will be reset to 0 (when its SYN is received) 2189 fReceiveWindowShift = 0; 2190 while (fReceiveWindowShift < TCP_MAX_WINDOW_SHIFT 2191 && (0xffffUL << fReceiveWindowShift) < socket->receive.buffer_size) { 2192 fReceiveWindowShift++; 2193 } 2194 2195 return B_OK; 2196 } 2197 2198 2199 void 2200 TCPEndpoint::_Acknowledged(tcp_segment_header& segment) 2201 { 2202 TRACE("_Acknowledged(): ack %" B_PRIu32 "; uack %" B_PRIu32 "; next %" 2203 B_PRIu32 "; max %" B_PRIu32, segment.acknowledge, 2204 fSendUnacknowledged.Number(), fSendNext.Number(), fSendMax.Number()); 2205 2206 ASSERT(fSendUnacknowledged <= segment.acknowledge); 2207 2208 if (fSendUnacknowledged < segment.acknowledge) { 2209 fSendQueue.RemoveUntil(segment.acknowledge); 2210 fSendUnacknowledged = segment.acknowledge; 2211 if (fSendNext < fSendUnacknowledged) 2212 fSendNext = fSendUnacknowledged; 2213 2214 if (segment.options & TCP_HAS_TIMESTAMPS) 2215 _UpdateRoundTripTime(tcp_diff_timestamp(segment.timestamp_reply)); 2216 else { 2217 // TODO: Fallback to RFC 793 type estimation; This just resets 2218 // any potential exponential back off that happened due to 2219 // retransmits. 
2220 fRetransmitTimeout = TCP_INITIAL_RTT; 2221 } 2222 2223 if (fSendUnacknowledged == fSendMax) { 2224 TRACE("all acknowledged, cancelling retransmission timer"); 2225 gStackModule->cancel_timer(&fRetransmitTimer); 2226 T(TimerSet(this, "retransmit", -1)); 2227 } else { 2228 TRACE("data acknowledged, resetting retransmission timer to: %" 2229 B_PRIdBIGTIME, fRetransmitTimeout); 2230 gStackModule->set_timer(&fRetransmitTimer, fRetransmitTimeout); 2231 T(TimerSet(this, "retransmit", fRetransmitTimeout)); 2232 } 2233 2234 if (is_writable(fState)) { 2235 // notify threads waiting on the socket to become writable again 2236 fSendCondition.NotifyAll(); 2237 gSocketModule->notify(socket, B_SELECT_WRITE, fSendQueue.Free()); 2238 } 2239 2240 if (fCongestionWindow < fSlowStartThreshold) 2241 fCongestionWindow += fSendMaxSegmentSize; 2242 } 2243 2244 if (fCongestionWindow >= fSlowStartThreshold) { 2245 uint32 increment = fSendMaxSegmentSize * fSendMaxSegmentSize; 2246 2247 if (increment < fCongestionWindow) 2248 increment = 1; 2249 else 2250 increment /= fCongestionWindow; 2251 2252 fCongestionWindow += increment; 2253 } 2254 2255 // if there is data left to be sent, send it now 2256 if (fSendQueue.Used() > 0) 2257 _SendQueued(); 2258 } 2259 2260 2261 void 2262 TCPEndpoint::_Retransmit() 2263 { 2264 TRACE("Retransmit()"); 2265 2266 _ResetSlowStart(); 2267 fSendNext = fSendUnacknowledged; 2268 2269 // Do exponential back off of the retransmit timeout 2270 fRetransmitTimeout *= 2; 2271 if (fRetransmitTimeout > TCP_MAX_RETRANSMIT_TIMEOUT) 2272 fRetransmitTimeout = TCP_MAX_RETRANSMIT_TIMEOUT; 2273 2274 _SendQueued(); 2275 } 2276 2277 2278 void 2279 TCPEndpoint::_UpdateRoundTripTime(int32 roundTripTime) 2280 { 2281 int32 rtt = roundTripTime; 2282 2283 // "smooth" round trip time as per Van Jacobson 2284 rtt -= fRoundTripTime / 8; 2285 fRoundTripTime += rtt; 2286 if (rtt < 0) 2287 rtt = -rtt; 2288 rtt -= fRoundTripDeviation / 4; 2289 fRoundTripDeviation += rtt; 2290 2291 
fRetransmitTimeout = ((fRoundTripTime / 4 + fRoundTripDeviation) / 2) 2292 * kTimestampFactor; 2293 if (fRetransmitTimeout < TCP_MIN_RETRANSMIT_TIMEOUT) 2294 fRetransmitTimeout = TCP_MIN_RETRANSMIT_TIMEOUT; 2295 2296 TRACE(" RTO is now %" B_PRIdBIGTIME " (after rtt %" B_PRId32 "ms)", 2297 fRetransmitTimeout, roundTripTime); 2298 } 2299 2300 2301 void 2302 TCPEndpoint::_ResetSlowStart() 2303 { 2304 fSlowStartThreshold = max_c((fSendMax - fSendUnacknowledged).Number() / 2, 2305 2 * fSendMaxSegmentSize); 2306 fCongestionWindow = fSendMaxSegmentSize; 2307 } 2308 2309 2310 // #pragma mark - timer 2311 2312 2313 /*static*/ void 2314 TCPEndpoint::_RetransmitTimer(net_timer* timer, void* _endpoint) 2315 { 2316 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2317 T(TimerTriggered(endpoint, "retransmit")); 2318 2319 MutexLocker locker(endpoint->fLock); 2320 if (!locker.IsLocked()) 2321 return; 2322 2323 endpoint->_Retransmit(); 2324 } 2325 2326 2327 /*static*/ void 2328 TCPEndpoint::_PersistTimer(net_timer* timer, void* _endpoint) 2329 { 2330 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2331 T(TimerTriggered(endpoint, "persist")); 2332 2333 MutexLocker locker(endpoint->fLock); 2334 if (!locker.IsLocked()) 2335 return; 2336 2337 // the timer might not have been canceled early enough 2338 if (endpoint->State() == CLOSED) 2339 return; 2340 2341 endpoint->_SendQueued(true); 2342 } 2343 2344 2345 /*static*/ void 2346 TCPEndpoint::_DelayedAcknowledgeTimer(net_timer* timer, void* _endpoint) 2347 { 2348 TCPEndpoint* endpoint = (TCPEndpoint*)_endpoint; 2349 T(TimerTriggered(endpoint, "delayed ack")); 2350 2351 MutexLocker locker(endpoint->fLock); 2352 if (!locker.IsLocked()) 2353 return; 2354 2355 // the timer might not have been canceled early enough 2356 if (endpoint->State() == CLOSED) 2357 return; 2358 2359 endpoint->SendAcknowledge(true); 2360 } 2361 2362 2363 /*static*/ void 2364 TCPEndpoint::_TimeWaitTimer(net_timer* timer, void* _endpoint) 2365 { 2366 TCPEndpoint* 
endpoint = (TCPEndpoint*)_endpoint; 2367 T(TimerTriggered(endpoint, "time-wait")); 2368 2369 MutexLocker locker(endpoint->fLock); 2370 if (!locker.IsLocked()) 2371 return; 2372 2373 if ((endpoint->fFlags & FLAG_CLOSED) == 0) { 2374 endpoint->fFlags |= FLAG_DELETE_ON_CLOSE; 2375 return; 2376 } 2377 2378 locker.Unlock(); 2379 2380 gSocketModule->release_socket(endpoint->socket); 2381 } 2382 2383 2384 /*static*/ status_t 2385 TCPEndpoint::_WaitForCondition(ConditionVariable& condition, 2386 MutexLocker& locker, bigtime_t timeout) 2387 { 2388 ConditionVariableEntry entry; 2389 condition.Add(&entry); 2390 2391 locker.Unlock(); 2392 status_t result = entry.Wait(B_ABSOLUTE_TIMEOUT | B_CAN_INTERRUPT, timeout); 2393 locker.Lock(); 2394 2395 return result; 2396 } 2397 2398 2399 // #pragma mark - 2400 2401 2402 void 2403 TCPEndpoint::Dump() const 2404 { 2405 kprintf("TCP endpoint %p\n", this); 2406 kprintf(" state: %s\n", name_for_state(fState)); 2407 kprintf(" flags: 0x%" B_PRIx32 "\n", fFlags); 2408 #if KDEBUG 2409 kprintf(" lock: { %p, holder: %" B_PRId32 " }\n", &fLock, fLock.holder); 2410 #endif 2411 kprintf(" accept sem: %" B_PRId32 "\n", fAcceptSemaphore); 2412 kprintf(" options: 0x%" B_PRIx32 "\n", (uint32)fOptions); 2413 kprintf(" send\n"); 2414 kprintf(" window shift: %" B_PRIu8 "\n", fSendWindowShift); 2415 kprintf(" unacknowledged: %" B_PRIu32 "\n", 2416 fSendUnacknowledged.Number()); 2417 kprintf(" next: %" B_PRIu32 "\n", fSendNext.Number()); 2418 kprintf(" max: %" B_PRIu32 "\n", fSendMax.Number()); 2419 kprintf(" urgent offset: %" B_PRIu32 "\n", fSendUrgentOffset.Number()); 2420 kprintf(" window: %" B_PRIu32 "\n", fSendWindow); 2421 kprintf(" max window: %" B_PRIu32 "\n", fSendMaxWindow); 2422 kprintf(" max segment size: %" B_PRIu32 "\n", fSendMaxSegmentSize); 2423 kprintf(" queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n", fSendQueue.Used(), 2424 fSendQueue.Size()); 2425 #if DEBUG_BUFFER_QUEUE 2426 fSendQueue.Dump(); 2427 #endif 2428 kprintf(" last acknowledge sent: 
%" B_PRIu32 "\n", 2429 fLastAcknowledgeSent.Number()); 2430 kprintf(" initial sequence: %" B_PRIu32 "\n", 2431 fInitialSendSequence.Number()); 2432 kprintf(" receive\n"); 2433 kprintf(" window shift: %" B_PRIu8 "\n", fReceiveWindowShift); 2434 kprintf(" next: %" B_PRIu32 "\n", fReceiveNext.Number()); 2435 kprintf(" max advertised: %" B_PRIu32 "\n", 2436 fReceiveMaxAdvertised.Number()); 2437 kprintf(" window: %" B_PRIu32 "\n", fReceiveWindow); 2438 kprintf(" max segment size: %" B_PRIu32 "\n", fReceiveMaxSegmentSize); 2439 kprintf(" queue: %" B_PRIuSIZE " / %" B_PRIuSIZE "\n", 2440 fReceiveQueue.Available(), fReceiveQueue.Size()); 2441 #if DEBUG_BUFFER_QUEUE 2442 fReceiveQueue.Dump(); 2443 #endif 2444 kprintf(" initial sequence: %" B_PRIu32 "\n", 2445 fInitialReceiveSequence.Number()); 2446 kprintf(" duplicate acknowledge count: %" B_PRIu32 "\n", 2447 fDuplicateAcknowledgeCount); 2448 kprintf(" round trip time: %" B_PRId32 " (deviation %" B_PRId32 ")\n", 2449 fRoundTripTime, fRoundTripDeviation); 2450 kprintf(" retransmit timeout: %" B_PRId64 "\n", fRetransmitTimeout); 2451 kprintf(" congestion window: %" B_PRIu32 "\n", fCongestionWindow); 2452 kprintf(" slow start threshold: %" B_PRIu32 "\n", fSlowStartThreshold); 2453 } 2454 2455